mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-28 17:27:26 +03:00
Compare commits
117 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7fb1e70b59 | ||
|
|
d374e71e55 | ||
|
|
30af6e2b98 | ||
|
|
d7be46189f | ||
|
|
bc81d47aba | ||
|
|
0b246862b9 | ||
|
|
a919001134 | ||
|
|
48e7078ee0 | ||
|
|
bb771cbd2b | ||
|
|
7c48fb81ce | ||
|
|
91eb8f4fa0 | ||
|
|
d205df6812 | ||
|
|
e8d2567429 | ||
|
|
09e7b76c93 | ||
|
|
48e7eae41c | ||
|
|
c5229087a5 | ||
|
|
e31cdaa0eb | ||
|
|
491c4d7d2e | ||
|
|
939a7dd648 | ||
|
|
8ad8aef447 | ||
|
|
f12cc6d0fa | ||
|
|
aa50b2c2ae | ||
|
|
c40006a62e | ||
|
|
c6e4088376 | ||
|
|
b36eefc1b3 | ||
|
|
837bb6b447 | ||
|
|
ba4dd0bc67 | ||
|
|
617255d437 | ||
|
|
87b0a60cdd | ||
|
|
fda8528aa8 | ||
|
|
2d0656fbdd | ||
|
|
6b4e4bd582 | ||
|
|
9f0e4b14d2 | ||
|
|
b3a739c9b6 | ||
|
|
4d8cc0c56f | ||
|
|
0d227ec358 | ||
|
|
1d971bba36 | ||
|
|
9777256c31 | ||
|
|
7085492c6f | ||
|
|
b4c0549a49 | ||
|
|
0d18aaa9d1 | ||
|
|
08bc21b459 | ||
|
|
35a74c8fb9 | ||
|
|
5190c2ea8d | ||
|
|
7799d31e68 | ||
|
|
3a3ed153d9 | ||
|
|
ef66bfab68 | ||
|
|
678d43d720 | ||
|
|
ef41a69179 | ||
|
|
3dc7684f39 | ||
|
|
dbe9c0c8ce | ||
|
|
6fe90deffa | ||
|
|
581d020b12 | ||
|
|
7623de11d9 | ||
|
|
c9d98295a3 | ||
|
|
1506d39e76 | ||
|
|
54121f7325 | ||
|
|
192d8ae8b8 | ||
|
|
35c9b1f39e | ||
|
|
4bead4e30d | ||
|
|
302e2c2652 | ||
|
|
328874d054 | ||
|
|
c1f1e28d29 | ||
|
|
5a4126adc1 | ||
|
|
a4d2d4ae41 | ||
|
|
d161ea7071 | ||
|
|
45158f460e | ||
|
|
22307b3e8b | ||
|
|
ce5890b5f7 | ||
|
|
b251f74f49 | ||
|
|
fa97041524 | ||
|
|
ae251b5ff2 | ||
|
|
66efd13375 | ||
|
|
6c4cbdc70b | ||
|
|
5fdf07e33b | ||
|
|
062d3115aa | ||
|
|
314e729347 | ||
|
|
d55fb97174 | ||
|
|
826539ce59 | ||
|
|
b96487645c | ||
|
|
9627d0f540 | ||
|
|
e2ef8fe42c | ||
|
|
6d57c26ef8 | ||
|
|
28123a3937 | ||
|
|
549b9d8433 | ||
|
|
5d246a792d | ||
|
|
63248fc3e3 | ||
|
|
83eebe9d08 | ||
|
|
fff63b5108 | ||
|
|
f3061116ff | ||
|
|
1c0f6db545 | ||
|
|
cec51c7a7d | ||
|
|
b22ff4b7b4 | ||
|
|
c0c7e147e7 | ||
|
|
b0df4c0cfd | ||
|
|
a497476330 | ||
|
|
95405ac65f | ||
|
|
0f3cb3fc8b | ||
|
|
1acee6bf89 | ||
|
|
ef570f6308 | ||
|
|
cc9e331213 | ||
|
|
bcfd1989e9 | ||
|
|
56f16f235c | ||
|
|
8cc67efcd4 | ||
|
|
95feeab52e | ||
|
|
99d4026b11 | ||
|
|
9c92e96a64 | ||
|
|
afcda09d15 | ||
|
|
bbce619adb | ||
|
|
4f0e43da6f | ||
|
|
bb28c1fe24 | ||
|
|
ee7c30578a | ||
|
|
47c0eda9d4 | ||
|
|
5306f4b3b5 | ||
|
|
40d5358d3c | ||
|
|
b65bb4baae | ||
|
|
a1a69f777a |
101
.devops/zendnn.Dockerfile
Normal file
101
.devops/zendnn.Dockerfile
Normal file
@@ -0,0 +1,101 @@
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
|
||||
|
||||
ENV CC=gcc-13 CXX=g++-13
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
RUN mkdir -p /app/lib && \
|
||||
find build -name "*.so*" -exec cp -P {} /app/lib \;
|
||||
|
||||
RUN mkdir -p /app/full \
|
||||
&& cp build/bin/* /app/full \
|
||||
&& cp *.py /app/full \
|
||||
&& cp -r conversion /app/full \
|
||||
&& cp -r gguf-py /app/full \
|
||||
&& cp -r requirements /app/full \
|
||||
&& cp requirements.txt /app/full \
|
||||
&& cp .devops/tools.sh /app/full/tools.sh
|
||||
|
||||
## Base image
|
||||
FROM ubuntu:$UBUNTU_VERSION AS base
|
||||
|
||||
ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
|
||||
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
|
||||
LABEL org.opencontainers.image.created=$BUILD_DATE \
|
||||
org.opencontainers.image.version=$APP_VERSION \
|
||||
org.opencontainers.image.revision=$APP_REVISION \
|
||||
org.opencontainers.image.title="llama.cpp" \
|
||||
org.opencontainers.image.description="LLM inference in C/C++" \
|
||||
org.opencontainers.image.url=$IMAGE_URL \
|
||||
org.opencontainers.image.source=$IMAGE_SOURCE
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y libgomp1 libnuma1 curl \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
|
||||
### Full
|
||||
FROM base AS full
|
||||
|
||||
COPY --from=build /app/full /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
|
||||
&& find /var/cache -type f -delete
|
||||
|
||||
ENTRYPOINT ["/app/tools.sh"]
|
||||
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
### Server, Server only
|
||||
FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
ENTRYPOINT [ "/app/llama-server" ]
|
||||
@@ -15,6 +15,6 @@ runs:
|
||||
id: setup
|
||||
uses: ./.github/actions/unarchive-tar
|
||||
with:
|
||||
url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
|
||||
url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
|
||||
path: ${{ inputs.path }}
|
||||
strip: 1
|
||||
|
||||
2
.github/actions/unarchive-tar/action.yml
vendored
2
.github/actions/unarchive-tar/action.yml
vendored
@@ -24,4 +24,4 @@ runs:
|
||||
run: |
|
||||
mkdir -p ${{ inputs.path }}
|
||||
cd ${{ inputs.path }}
|
||||
curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
|
||||
curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
|
||||
|
||||
31
.github/actions/windows-setup-cuda/action.yml
vendored
31
.github/actions/windows-setup-cuda/action.yml
vendored
@@ -96,3 +96,34 @@ runs:
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 13.3
|
||||
if: ${{ inputs.cuda_version == '13.3' }}
|
||||
shell: pwsh
|
||||
run: |
|
||||
mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
|
||||
choco install unzip -y
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip"
|
||||
curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip"
|
||||
unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
|
||||
echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
6
.github/workflows/build-3rd-party.yml
vendored
6
.github/workflows/build-3rd-party.yml
vendored
@@ -22,9 +22,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-llguidance:
|
||||
|
||||
@@ -31,7 +31,7 @@ jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -61,7 +61,7 @@ jobs:
|
||||
linux-iot-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
71
.github/workflows/build-android.yml
vendored
71
.github/workflows/build-android.yml
vendored
@@ -27,12 +27,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
android:
|
||||
default:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -58,7 +58,7 @@ jobs:
|
||||
cd examples/llama.android
|
||||
./gradlew build --no-daemon
|
||||
|
||||
android-ndk:
|
||||
ndk:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
@@ -73,6 +73,11 @@ jobs:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential
|
||||
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
run: |
|
||||
@@ -86,3 +91,59 @@ jobs:
|
||||
with:
|
||||
name: llama-cpp-android-arm64-cpu
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
arm64:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
env:
|
||||
NDK_VERSION: "29.0.14206865"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
|
||||
# for some reason, the ccache does not improve the build time in this case
|
||||
# example:
|
||||
# cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
|
||||
# cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
|
||||
#
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: android-ubuntu-arm64
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
with:
|
||||
java-version: 17
|
||||
distribution: temurin
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
- name: Install NDK
|
||||
run: |
|
||||
sdkmanager "ndk;${{ env.NDK_VERSION }}"
|
||||
echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
108
.github/workflows/build-apple.yml
vendored
108
.github/workflows/build-apple.yml
vendored
@@ -32,12 +32,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-ios:
|
||||
macos-latest-arm64:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -48,7 +48,80 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-ios
|
||||
key: apple-arm64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=OFF \
|
||||
-DGGML_METAL_SHADER_DEBUG=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -E "test-llama-archs" --verbose --timeout 900
|
||||
|
||||
macos-latest-x64:
|
||||
runs-on: macos-15-intel
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||
# https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_RPATH="@loader_path" \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_METAL=OFF \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macos-latest-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-ios
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -59,6 +132,7 @@ jobs:
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
@@ -89,6 +163,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -115,7 +190,7 @@ jobs:
|
||||
xcodebuild -downloadPlatform iOS
|
||||
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
macOS-latest-tvos:
|
||||
macos-latest-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -123,10 +198,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-tvos
|
||||
key: apple-tvos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -138,6 +214,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -147,7 +224,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-visionos:
|
||||
macos-latest-visionos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -155,6 +232,14 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: apple-visionos
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
@@ -163,6 +248,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -172,7 +258,7 @@ jobs:
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||
|
||||
macOS-latest-swift:
|
||||
macos-latest-swift:
|
||||
runs-on: macos-latest
|
||||
needs: macos-latest-ios-xcode
|
||||
|
||||
@@ -185,10 +271,11 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# TODO: this likely does not do anything - if yes, remove it
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-swift
|
||||
key: apple-swift
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -206,6 +293,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
8
.github/workflows/build-cache.yml
vendored
8
.github/workflows/build-cache.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
# - name: Setup SpacemiT Toolchain
|
||||
# if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
@@ -81,7 +81,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
|
||||
140
.github/workflows/build-cann.yml
vendored
140
.github/workflows/build-cann.yml
vendored
@@ -29,74 +29,76 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
openEuler-latest-cann:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -el {0}
|
||||
strategy:
|
||||
matrix:
|
||||
arch: [x86, aarch64]
|
||||
chip_type: ['910b', '310p']
|
||||
build: ['Release']
|
||||
use_acl_graph: ['on', 'off']
|
||||
exclude:
|
||||
# 310P does not support USE_ACL_GRAPH=on
|
||||
- chip_type: '310p'
|
||||
use_acl_graph: 'on'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-latest-cann:
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash -el {0}
|
||||
# strategy:
|
||||
# matrix:
|
||||
# arch: [x86, aarch64]
|
||||
# chip_type: ['910b', '310p']
|
||||
# build: ['Release']
|
||||
# use_acl_graph: ['on', 'off']
|
||||
# exclude:
|
||||
# # 310P does not support USE_ACL_GRAPH=on
|
||||
# - chip_type: '310p'
|
||||
# use_acl_graph: 'on'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
|
||||
18
.github/workflows/build-cmake-pkg.yml
vendored
18
.github/workflows/build-cmake-pkg.yml
vendored
@@ -5,23 +5,23 @@ on:
|
||||
|
||||
jobs:
|
||||
linux:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, Linux, CPU]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install -y build-essential tcl cmake
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
PREFIX="$(pwd)"/inst
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix "$PREFIX" --config Release
|
||||
|
||||
|
||||
231
.github/workflows/build-cpu.yml
vendored
Normal file
231
.github/workflows/build-cpu.yml
vendored
Normal file
@@ -0,0 +1,231 @@
|
||||
name: CI (cpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cpu.yml',
|
||||
'.github/workflows/build-cmake-pkg.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cpu.yml',
|
||||
'.github/workflows/build-cmake-pkg.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh',
|
||||
'**/*.swift',
|
||||
'**/*.m',
|
||||
'**/*.metal',
|
||||
'**/*.comp',
|
||||
'**/*.glsl',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
build-cmake-pkg:
|
||||
uses: ./.github/workflows/build-cmake-pkg.yml
|
||||
|
||||
ubuntu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-22.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cpu-${{ matrix.os }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
env:
|
||||
OPENBLAS_VERSION: 0.3.23
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
VULKAN_VERSION: 1.4.313.2
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64-cpu-static'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
|
||||
- build: 'x64-openblas'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'x64-vulkan'
|
||||
arch: 'x64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
|
||||
- build: 'arm64'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cpu-windows-2025-${{ matrix.build }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
mkdir $env:RUNNER_TEMP/openblas
|
||||
tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
|
||||
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'x64-vulkan' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build ${{ matrix.defines }} `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'x64-openblas' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.arch == 'x64' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
# TODO: disabled for now, consider adding tests for all CPU variants instead
|
||||
# - name: Test (Intel SDE)
|
||||
# id: cmake_test_sde
|
||||
# if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
# run: |
|
||||
# curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# # for some weird reason windows tar doesn't like sde tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
# $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
# cd build
|
||||
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
|
||||
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
4
.github/workflows/build-cross.yml
vendored
4
.github/workflows/build-cross.yml
vendored
@@ -277,7 +277,7 @@ jobs:
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build-cache.yml
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
|
||||
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
@@ -287,7 +287,7 @@ jobs:
|
||||
# id: cache-toolchain
|
||||
# with:
|
||||
# path: ./spacemit_toolchain
|
||||
# key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
# key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup SpacemiT Toolchain
|
||||
#if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
|
||||
134
.github/workflows/build-cuda-ubuntu.yml
vendored
Normal file
134
.github/workflows/build-cuda-ubuntu.yml
vendored
Normal file
@@ -0,0 +1,134 @@
|
||||
name: CI (CUDA, ubuntu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-cuda-ubuntu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cu',
|
||||
'**/*.cuh'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-cuda-ubuntu.yml',
|
||||
'ggml/src/ggml-cuda/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
cuda:
|
||||
runs-on: ubuntu-24.04
|
||||
container: nvidia/cuda:12.6.2-devel-ubuntu24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Install dependencies
|
||||
env:
|
||||
DEBIAN_FRONTEND: noninteractive
|
||||
run: |
|
||||
apt update
|
||||
apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-24.04-cuda
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with CMake
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
cmake -S . -B build -G Ninja \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_CUDA_ARCHITECTURES=89-real \
|
||||
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CUDA=ON \
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
cmake --build build
|
||||
|
||||
hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-22.04-hip
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with native CMake HIP support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGPU_TARGETS="gfx1030" \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential git cmake libssl-dev
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: cuda-ubuntu-22.04-musa
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build with native CMake MUSA support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . \
|
||||
-DGGML_MUSA=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
146
.github/workflows/build-cuda-windows.yml
vendored
Normal file
146
.github/workflows/build-cuda-windows.yml
vendored
Normal file
@@ -0,0 +1,146 @@
|
||||
name: CI (CUDA, windows)
|
||||
|
||||
# TODO: this workflow is only triggered manually because it is very heavy on the CI
|
||||
# when we provision dedicated windows runners, we can enable it for pushes too
|
||||
# note: running this workflow manually will populate the ccache for the release builds
|
||||
# this can be used before merging a PR to speed up the release workflow
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
|
||||
# note: this will run in queue with the release workflow
|
||||
concurrency:
|
||||
group: release
|
||||
queue: max
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
cuda:
|
||||
runs-on: windows-2022
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '13.3']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
with:
|
||||
cuda_version: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
|
||||
run: |
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DLLAMA_BUILD_SERVER=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_NATIVE=OFF ^
|
||||
-DGGML_BACKEND_DL=ON ^
|
||||
-DGGML_CPU_ALL_VARIANTS=ON ^
|
||||
-DGGML_CUDA=ON ^
|
||||
-DGGML_RPC=ON ^
|
||||
-DGGML_CUDA_CUB_3DOT2=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
|
||||
cmake --build build --config Release
|
||||
|
||||
hip:
|
||||
runs-on: windows-2022
|
||||
|
||||
env:
|
||||
# Make sure this is in sync with build-cache.yml
|
||||
HIPSDK_INSTALLER_VERSION: "26.Q1"
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# sync with release.yml
|
||||
- name: "radeon"
|
||||
gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Grab rocWMMA package
|
||||
id: grab_rocwmma
|
||||
run: |
|
||||
curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
|
||||
7z x rocwmma.deb
|
||||
7z x data.tar
|
||||
|
||||
- name: Use ROCm Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-rocm
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
uses: ./.github/actions/windows-setup-rocm
|
||||
with:
|
||||
version: ${{ env.HIPSDK_INSTALLER_VERSION }}
|
||||
|
||||
- name: Verify ROCm
|
||||
id: verify
|
||||
run: |
|
||||
# Find and test ROCm installation
|
||||
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
|
||||
if (-not $clangPath) {
|
||||
Write-Error "ROCm installation not found"
|
||||
exit 1
|
||||
}
|
||||
& $clangPath.FullName --version
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
# TODO: this build does not match the build in release.yml, so we use a different cache key
|
||||
# ideally, the builds should match, similar to the CUDA build above so that we would be able
|
||||
# to populate the ccache for the release with manual runs of this workflow
|
||||
#key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||
cmake -G "Unix Makefiles" -B build -S . `
|
||||
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
|
||||
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
|
||||
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DLLAMA_BUILD_BORINGSSL=ON `
|
||||
-DROCM_DIR="${env:HIP_PATH}" `
|
||||
-DGGML_HIP=ON `
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON `
|
||||
-DGPU_TARGETS="gfx1100" `
|
||||
-DGGML_RPC=ON
|
||||
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
|
||||
150
.github/workflows/build-ibm.yml
vendored
Normal file
150
.github/workflows/build-ibm.yml
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
name: CI (ibm)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-ibm.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-ibm.yml',
|
||||
'ggml/src/ggml-cpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-s390x:
|
||||
runs-on: ubuntu-24.04-s390x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Swap Endianness
|
||||
id: endianness
|
||||
run: |
|
||||
for f in models/*.gguf; do
|
||||
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
|
||||
done
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c (s390x)
|
||||
id: llama2c_test_s390x
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch llama2c big-endian model"
|
||||
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
|
||||
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-24-ppc64le:
|
||||
runs-on: ubuntu-24.04-ppc64le
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Build Dependencies
|
||||
id: build_depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
python3 python3-pip python3-dev python3-wheel \
|
||||
libjpeg-dev build-essential libssl-dev \
|
||||
git-lfs
|
||||
|
||||
- name: Toolchain workaround (GCC 14)
|
||||
run: |
|
||||
sudo apt-get install -y gcc-14 g++-14
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Python Dependencies
|
||||
id: python_depends
|
||||
run: |
|
||||
export PIP_BREAK_SYSTEM_PACKAGES="1"
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
pip3 install ./gguf-py
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
8
.github/workflows/build-msys.yml
vendored
8
.github/workflows/build-msys.yml
vendored
@@ -15,9 +15,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-msys2:
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.16
|
||||
# with:
|
||||
# key: windows-msys2
|
||||
# key: msys-windows-2025-x64
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
82
.github/workflows/build-opencl.yml
vendored
Normal file
82
.github/workflows/build-opencl.yml
vendored
Normal file
@@ -0,0 +1,82 @@
|
||||
name: CI (opencl)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-opencl.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.cl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-opencl.yml',
|
||||
'ggml/src/ggml-opencl/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
windows-2025-opencl-adreno:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: opencl-windows-2025-x64
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install OpenCL Headers and Libs
|
||||
id: install_opencl
|
||||
run: |
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers
|
||||
cd OpenCL-Headers
|
||||
cmake -B build `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build --target install
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
|
||||
cd OpenCL-ICD-Loader
|
||||
cmake -B build-arm64-release `
|
||||
-A arm64 `
|
||||
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
|
||||
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
|
||||
cmake --build build-arm64-release --target install --config release
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
10
.github/workflows/build-openvino.yml
vendored
10
.github/workflows/build-openvino.yml
vendored
@@ -29,9 +29,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-openvino:
|
||||
@@ -67,7 +67,7 @@ jobs:
|
||||
if: runner.environment == 'github-hosted'
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
|
||||
key: openvino-ubuntu-24.04-${{ matrix.variant }}-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
|
||||
92
.github/workflows/build-riscv.yml
vendored
92
.github/workflows/build-riscv.yml
vendored
@@ -29,11 +29,84 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-cpu-riscv64-native:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
steps:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
# Install necessary packages
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libssl-dev
|
||||
|
||||
# Set gcc-14 and g++-14 as the default compilers
|
||||
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
|
||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
|
||||
|
||||
git lfs install
|
||||
|
||||
- name: Check environment
|
||||
run: |
|
||||
uname -a
|
||||
gcc --version
|
||||
g++ --version
|
||||
ldd --version
|
||||
cmake --version
|
||||
rustc --version
|
||||
env
|
||||
echo "nproc=$(nproc)"
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: riscv-ubuntu-native
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENMP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||
-DLLAMA_BUILD_TOOLS=ON \
|
||||
-DLLAMA_BUILD_TESTS=ON \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||
-DGGML_RPC=ON \
|
||||
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
|
||||
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
- name: Test llama2c conversion
|
||||
id: llama2c_test
|
||||
run: |
|
||||
cd build
|
||||
echo "Fetch tokenizer"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
|
||||
echo "Fetch llama2c model"
|
||||
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
|
||||
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
ubuntu-riscv64-native-sanitizer:
|
||||
runs-on: ubuntu-24.04-riscv
|
||||
|
||||
@@ -62,12 +135,13 @@ jobs:
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
with:
|
||||
key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
# note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
|
||||
# with:
|
||||
# key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
|
||||
66
.github/workflows/build-rpc.yml
vendored
Normal file
66
.github/workflows/build-rpc.yml
vendored
Normal file
@@ -0,0 +1,66 @@
|
||||
name: CI (rpc)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-rpc.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-rpc.yml',
|
||||
'ggml/src/ggml-rpc/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev ninja-build
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_RPC=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
51
.github/workflows/build-sanitize.yml
vendored
51
.github/workflows/build-sanitize.yml
vendored
@@ -22,66 +22,65 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
ctest:
|
||||
runs-on: [self-hosted, X64, CPU, Linux]
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
build_type: [Debug]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
# with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
|
||||
- name: Build (undefined)
|
||||
id: cmake_build_undefined
|
||||
if: ${{ matrix.sanitizer == 'UNDEFINED' }}
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential libssl-dev
|
||||
cmake -B build \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config Debug -j $(nproc)
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||
if: ${{ matrix.sanitizer == 'ADDRESS' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Build (no OpenMP)
|
||||
id: cmake_build_no_openmp
|
||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_FATAL_WARNINGS=ON \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||
-DGGML_OPENMP=OFF
|
||||
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build build --config RelWithDebInfo -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# skip run in Debug - very slow
|
||||
if: ${{ matrix.sanitizer != 'UNDEFINED' }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
ctest -L main -E tokenizer --verbose --timeout 900
|
||||
|
||||
198
.github/workflows/build-self-hosted.yml
vendored
198
.github/workflows/build-self-hosted.yml
vendored
@@ -50,29 +50,12 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
determine-tag:
|
||||
name: Determine tag name
|
||||
runs-on: ubuntu-slim
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
ggml-ci-nvidia-cuda:
|
||||
needs: determine-tag
|
||||
gpu-cuda:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -82,14 +65,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
nvidia-smi
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-nvidia-cm:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
|
||||
steps:
|
||||
@@ -99,14 +79,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-vulkan-cm2:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-nvidia-cm2:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
|
||||
|
||||
steps:
|
||||
@@ -116,14 +93,12 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-nvidia-webgpu:
|
||||
runs-on: [self-hosted, Linux, NVIDIA]
|
||||
gpu-webgpu-nvidia:
|
||||
runs-on: [self-hosted, Linux, NVIDIA, X64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -149,10 +124,10 @@ jobs:
|
||||
GG_BUILD_WEBGPU=1 \
|
||||
GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMX-compatible machine
|
||||
#ggml-ci-cpu-amx:
|
||||
#cpu-amx:
|
||||
# runs-on: [self-hosted, Linux, CPU, AMX]
|
||||
|
||||
# steps:
|
||||
@@ -163,10 +138,10 @@ jobs:
|
||||
# - name: Test
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-vulkan:
|
||||
# amd-vulkan:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
@@ -178,10 +153,10 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# vulkaninfo --summary
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
# TODO: provision AMD GPU machine
|
||||
# ggml-ci-amd-rocm:
|
||||
# amd-rocm:
|
||||
# runs-on: [self-hosted, Linux, AMD]
|
||||
|
||||
# steps:
|
||||
@@ -193,10 +168,9 @@ jobs:
|
||||
# id: ggml-ci
|
||||
# run: |
|
||||
# amd-smi static
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
|
||||
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-metal:
|
||||
needs: determine-tag
|
||||
gpu-metal:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -206,13 +180,10 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-webgpu:
|
||||
needs: determine-tag
|
||||
gpu-webgpu-apple:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -235,14 +206,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
|
||||
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-mac-vulkan:
|
||||
needs: determine-tag
|
||||
gpu-vulkan:
|
||||
runs-on: [self-hosted, macOS, ARM64]
|
||||
|
||||
steps:
|
||||
@@ -252,14 +220,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-linux-intel-vulkan:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-intel-linux:
|
||||
runs-on: [self-hosted, Linux, Intel]
|
||||
|
||||
steps:
|
||||
@@ -271,14 +236,11 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
ggml-ci-win-intel-vulkan:
|
||||
needs: determine-tag
|
||||
gpu-vulkan-intel-windows:
|
||||
runs-on: [self-hosted, Windows, X64, Intel]
|
||||
|
||||
steps:
|
||||
@@ -293,15 +255,13 @@ jobs:
|
||||
MSYSTEM: UCRT64
|
||||
CHERE_INVOKING: 1
|
||||
PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
vulkaninfo --summary
|
||||
# Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
|
||||
# a valid python environment for testing
|
||||
LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
|
||||
|
||||
ggml-ci-intel-openvino-gpu-low-perf:
|
||||
needs: determine-tag
|
||||
cpu-openvino-low-perf:
|
||||
runs-on: [self-hosted, Linux, Intel, OpenVINO]
|
||||
|
||||
concurrency:
|
||||
@@ -333,8 +293,112 @@ jobs:
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
env:
|
||||
HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
run: |
|
||||
source ./openvino_toolkit/setupvars.sh
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-any-low-perf:
|
||||
runs-on: [self-hosted, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-any-high-perf:
|
||||
runs-on: [self-hosted, CPU]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-arm64-graviton4:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
|
||||
|
||||
cpu-arm64-graviton4-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
set -euxo pipefail
|
||||
sudo apt-get update
|
||||
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
python3-venv \
|
||||
gpg \
|
||||
wget \
|
||||
time \
|
||||
git-lfs
|
||||
|
||||
git lfs install
|
||||
|
||||
# install the latest cmake
|
||||
sudo install -d /usr/share/keyrings
|
||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
|
||||
| gpg --dearmor \
|
||||
| sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
|
||||
echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
|
||||
| sudo tee /etc/apt/sources.list.d/kitware.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake
|
||||
|
||||
- name: Test
|
||||
id: ggml-ci
|
||||
run: |
|
||||
GG_BUILD_KLEIDIAI=1 \
|
||||
GG_BUILD_EXTRA_TESTS_0=1 \
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
252
.github/workflows/build-sycl.yml
vendored
252
.github/workflows/build-sycl.yml
vendored
@@ -29,132 +29,134 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
|
||||
ubuntu-24-sycl:
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# continue-on-error: true
|
||||
#
|
||||
# steps:
|
||||
# - uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-ubuntu-24-${{ matrix.build }}
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
windows-latest-sycl:
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
# TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: examples/sycl/win-build-sycl.bat
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-latest-sycl:
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: sycl-windows-latest
|
||||
# variant: ccache
|
||||
# evict-old-files: 1d
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
#
|
||||
# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: examples/sycl/win-build-sycl.bat
|
||||
|
||||
59
.github/workflows/build-vulkan.yml
vendored
59
.github/workflows/build-vulkan.yml
vendored
@@ -31,12 +31,59 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-24-vulkan-llvmpipe:
|
||||
ubuntu:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-24.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: vulkan-${{ matrix.os }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
|
||||
echo "CC=gcc-14" >> "$GITHUB_ENV"
|
||||
echo "CXX=g++-14" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Configure
|
||||
id: cmake_configure
|
||||
run: |
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DGGML_BACKEND_DL=ON \
|
||||
-DGGML_CPU_ALL_VARIANTS=ON \
|
||||
-DGGML_VULKAN=ON
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
time cmake --build build -j $(nproc)
|
||||
|
||||
ubuntu-llvmpipe:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
@@ -47,7 +94,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-vulkan-llvmpipe
|
||||
key: vulkan-ubuntu-24.04-llvmpipe
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
@@ -68,7 +115,7 @@ jobs:
|
||||
id: cache-sdk
|
||||
with:
|
||||
path: ./vulkan_sdk
|
||||
key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Setup Vulkan SDK
|
||||
if: steps.cache-sdk.outputs.cache-hit != 'true'
|
||||
|
||||
181
.github/workflows/build-webgpu.yml
vendored
Normal file
181
.github/workflows/build-webgpu.yml
vendored
Normal file
@@ -0,0 +1,181 @@
|
||||
name: CI (webgpu)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-webgpu.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp',
|
||||
'**/*.wgsl'
|
||||
]
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-webgpu.yml',
|
||||
'ggml/src/ggml-webgpu/**'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-macos-latest
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
export CMAKE_PREFIX_PATH=dawn
|
||||
cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
|
||||
time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-ubuntu-24.04
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo add-apt-repository -y ppa:kisak/kisak-mesa
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install -y build-essential mesa-vulkan-drivers \
|
||||
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
|
||||
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
export Dawn_DIR=dawn/lib64/cmake/Dawn
|
||||
cmake -B build \
|
||||
-DGGML_WEBGPU=ON
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
# test-backend-ops is too slow on llvmpipe, skip it
|
||||
ctest -L main -E test-backend-ops --verbose --timeout 900
|
||||
|
||||
ubuntu-wasm:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'x64'
|
||||
os: ubuntu-24.04
|
||||
- build: 'arm64'
|
||||
os: ubuntu-24.04-arm
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: webgpu-${{ matrix.os }}-wasm
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Install Emscripten
|
||||
run: |
|
||||
git clone https://github.com/emscripten-core/emsdk.git
|
||||
cd emsdk
|
||||
./emsdk install latest
|
||||
./emsdk activate latest
|
||||
|
||||
- name: Fetch emdawnwebgpu
|
||||
run: |
|
||||
DAWN_TAG="v20260317.182325"
|
||||
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
|
||||
echo "Downloading ${EMDAWN_PKG}"
|
||||
curl -L -o emdawn.zip \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
|
||||
unzip emdawn.zip
|
||||
|
||||
- name: Build WASM WebGPU
|
||||
run: |
|
||||
source emsdk/emsdk_env.sh
|
||||
emcmake cmake -B build-wasm \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_WEBGPU=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
|
||||
|
||||
time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
|
||||
1110
.github/workflows/build.yml
vendored
1110
.github/workflows/build.yml
vendored
File diff suppressed because it is too large
Load Diff
2
.github/workflows/check-vendor.yml
vendored
2
.github/workflows/check-vendor.yml
vendored
@@ -19,7 +19,7 @@ on:
|
||||
|
||||
jobs:
|
||||
check-vendor:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/code-style.yml
vendored
2
.github/workflows/code-style.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
model-naming:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- name: Check model naming conventions
|
||||
|
||||
2
.github/workflows/editorconfig.yml
vendored
2
.github/workflows/editorconfig.yml
vendored
@@ -15,7 +15,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
editorconfig:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
|
||||
8
.github/workflows/hip-quality-check.yml
vendored
8
.github/workflows/hip-quality-check.yml
vendored
@@ -28,9 +28,9 @@ concurrency:
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
ubuntu-22-hip-quality-check:
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-22-hip-quality-check
|
||||
key: hip-quality-check-ubuntu-22.04
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
|
||||
16
.github/workflows/pre-tokenizer-hashes.yml
vendored
16
.github/workflows/pre-tokenizer-hashes.yml
vendored
@@ -3,16 +3,16 @@ name: Check Pre-Tokenizer Hashes
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'conversion/base.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'conversion/base.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@@ -30,16 +30,16 @@ jobs:
|
||||
|
||||
- name: Update pre-tokenizer hashes
|
||||
run: |
|
||||
cp convert_hf_to_gguf.py /tmp
|
||||
cp conversion/base.py /tmp
|
||||
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||
|
||||
- name: Check if committed pre-tokenizer hashes matches generated version
|
||||
run: |
|
||||
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||
if ! diff -q conversion/base.py /tmp/base.py; then
|
||||
echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
|
||||
echo "Differences found:"
|
||||
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||
diff conversion/base.py /tmp/base.py || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Model pre-tokenizer hashes are up to date."
|
||||
|
||||
@@ -20,7 +20,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, CPU, fast]
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
2
.github/workflows/python-lint.yml
vendored
2
.github/workflows/python-lint.yml
vendored
@@ -21,7 +21,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
flake8-lint:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: Lint
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
2
.github/workflows/python-type-check.yml
vendored
2
.github/workflows/python-type-check.yml
vendored
@@ -22,7 +22,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
python-type-check:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
name: python type-check
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
|
||||
763
.github/workflows/release.yml
vendored
763
.github/workflows/release.yml
vendored
@@ -27,18 +27,40 @@ on:
|
||||
'**/*.glsl'
|
||||
]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
|
||||
|
||||
# note: run this workflow one at a time for better cache reuse
|
||||
concurrency:
|
||||
group: release
|
||||
queue: max
|
||||
|
||||
jobs:
|
||||
check_release:
|
||||
runs-on: ubuntu-slim
|
||||
|
||||
macOS-cpu:
|
||||
outputs:
|
||||
should_release: ${{ steps.check.outputs.should_release }}
|
||||
|
||||
steps:
|
||||
- id: check
|
||||
run: |
|
||||
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then
|
||||
if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "should_release=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
else
|
||||
echo "should_release=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
macos-cpu:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -46,10 +68,12 @@ jobs:
|
||||
arch: 'arm64'
|
||||
os: macos-14
|
||||
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
|
||||
- build: 'arm64-kleidiai'
|
||||
arch: 'arm64'
|
||||
os: macos-14
|
||||
defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23780)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
#- build: 'arm64-kleidiai'
|
||||
# arch: 'arm64'
|
||||
# os: macos-14
|
||||
# defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
|
||||
- build: 'x64'
|
||||
arch: 'x64'
|
||||
os: macos-15-intel
|
||||
@@ -76,8 +100,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: macOS-latest-${{ matrix.arch }}
|
||||
evict-old-files: 1d
|
||||
key: release-${{ matrix.os }}-${{ matrix.arch }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -109,7 +133,8 @@ jobs:
|
||||
name: llama-bin-macos-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-cpu:
|
||||
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
@@ -140,8 +165,8 @@ jobs:
|
||||
if: ${{ matrix.build != 's390x' }}
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-cpu-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
key: release-${{ matrix.os }}-cpu
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -186,6 +211,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
|
||||
|
||||
ubuntu-vulkan:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -214,8 +241,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-vulkan-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
key: release-${{ matrix.os }}-vulkan
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -262,6 +289,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
|
||||
|
||||
android-arm64:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -282,11 +311,17 @@ jobs:
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: android-arm64
|
||||
evict-old-files: 1d
|
||||
# note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
|
||||
# for some reason, the ccache does not improve the build time in this case
|
||||
# example:
|
||||
# cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
|
||||
# cache on: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
|
||||
#
|
||||
#- name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-android-arm64
|
||||
# append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Set up JDK
|
||||
uses: actions/setup-java@v5
|
||||
@@ -339,6 +374,8 @@ jobs:
|
||||
name: llama-bin-android-arm64.tar.gz
|
||||
|
||||
ubuntu-24-openvino:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
@@ -371,8 +408,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-openvino-release-no-preset-v1
|
||||
evict-old-files: 1d
|
||||
key: release-ubuntu-24.04-openvino-release-no-preset-v1
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
@@ -385,7 +422,7 @@ jobs:
|
||||
id: cache-openvino
|
||||
with:
|
||||
path: ./openvino_toolkit
|
||||
key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
|
||||
|
||||
- name: Setup OpenVINO Toolkit
|
||||
if: steps.cache-openvino.outputs.cache-hit != 'true'
|
||||
@@ -427,6 +464,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
|
||||
|
||||
windows-cpu:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -452,9 +491,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-cpu-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2025-${{ matrix.arch }}-cpu
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Ninja
|
||||
run: |
|
||||
@@ -487,6 +525,8 @@ jobs:
|
||||
name: llama-bin-win-cpu-${{ matrix.arch }}.zip
|
||||
|
||||
windows:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2025
|
||||
|
||||
@@ -521,9 +561,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2025-${{ matrix.arch }}-${{ matrix.backend }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
@@ -577,12 +616,14 @@ jobs:
|
||||
name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip
|
||||
|
||||
windows-cuda:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.4', '13.1']
|
||||
cuda: ['12.4', '13.3']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -596,12 +637,11 @@ jobs:
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: Install ccache
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-cuda-${{ matrix.cuda }}
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install Cuda Toolkit
|
||||
uses: ./.github/actions/windows-setup-cuda
|
||||
@@ -655,214 +695,216 @@ jobs:
|
||||
path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip
|
||||
|
||||
windows-sycl:
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# windows-sycl:
|
||||
#
|
||||
# runs-on: windows-2022
|
||||
#
|
||||
# defaults:
|
||||
# run:
|
||||
# shell: bash
|
||||
#
|
||||
# env:
|
||||
# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: pwsh
|
||||
# run: |
|
||||
# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-windows-2022-x64-sycl
|
||||
# append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# shell: cmd
|
||||
# run: |
|
||||
# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
# cmake -G "Ninja" -B build ^
|
||||
# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
# -DCMAKE_BUILD_TYPE=Release ^
|
||||
# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
# -DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
# -DLLAMA_BUILD_BORINGSSL=ON
|
||||
# cmake --build build --target ggml-sycl -j
|
||||
#
|
||||
# - name: Build the release package
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
# if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
# echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
# cp "$ZE_LOADER_DLL" ./build/bin
|
||||
# else
|
||||
# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
# fi
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
#
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
#
|
||||
# echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
#
|
||||
# - name: Upload the release package
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-bin-win-sycl-x64.zip
|
||||
# name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-sycl
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
cmake -G "Ninja" -B build ^
|
||||
-DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
|
||||
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON
|
||||
cmake --build build --target ggml-sycl -j
|
||||
|
||||
- name: Build the release package
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
|
||||
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload the release package
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-bin-win-sycl-x64.zip
|
||||
name: llama-bin-win-sycl-x64.zip
|
||||
|
||||
ubuntu-24-sycl:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [fp32, fp16]
|
||||
include:
|
||||
- build: fp32
|
||||
fp16: OFF
|
||||
- build: fp16
|
||||
fp16: ON
|
||||
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Use oneAPI Installation Cache
|
||||
uses: actions/cache@v5
|
||||
id: cache-sycl
|
||||
with:
|
||||
path: ${{ env.ONEAPI_ROOT }}
|
||||
key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: Download & Install oneAPI
|
||||
shell: bash
|
||||
if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-24-sycl-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
cmake -B build \
|
||||
-G "Ninja" \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_SYCL=ON \
|
||||
-DCMAKE_C_COMPILER=icx \
|
||||
-DCMAKE_CXX_COMPILER=icpx \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
time cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# ubuntu-24-sycl:
|
||||
#
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build: [fp32]
|
||||
# include:
|
||||
# - build: fp32
|
||||
# fp16: OFF
|
||||
#
|
||||
# runs-on: ubuntu-24.04
|
||||
#
|
||||
# env:
|
||||
# ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
# ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
# LEVEL_ZERO_VERSION: "1.28.2"
|
||||
# LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Use oneAPI Installation Cache
|
||||
# uses: actions/cache@v5
|
||||
# id: cache-sycl
|
||||
# with:
|
||||
# path: ${{ env.ONEAPI_ROOT }}
|
||||
# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
#
|
||||
# - name: Download & Install oneAPI
|
||||
# shell: bash
|
||||
# if: steps.cache-sycl.outputs.cache-hit != 'true'
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
#
|
||||
# - name: Install Level Zero SDK
|
||||
# shell: bash
|
||||
# run: |
|
||||
# cd /tmp
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
#
|
||||
# - name: Setup Node.js
|
||||
# uses: actions/setup-node@v6
|
||||
# with:
|
||||
# node-version: "24"
|
||||
# cache: "npm"
|
||||
# cache-dependency-path: "tools/ui/package-lock.json"
|
||||
#
|
||||
# - name: ccache
|
||||
# uses: ggml-org/ccache-action@v1.2.21
|
||||
# with:
|
||||
# key: release-ubuntu-24.04-sycl
|
||||
# append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# cmake -B build \
|
||||
# -G "Ninja" \
|
||||
# -DCMAKE_BUILD_TYPE=Release \
|
||||
# -DGGML_SYCL=ON \
|
||||
# -DCMAKE_C_COMPILER=icx \
|
||||
# -DCMAKE_CXX_COMPILER=icpx \
|
||||
# -DLLAMA_OPENSSL=OFF \
|
||||
# -DGGML_NATIVE=OFF \
|
||||
# -DGGML_SYCL_F16=${{ matrix.fp16 }}
|
||||
# time cmake --build build --config Release -j $(nproc)
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# id: pack_artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
|
||||
|
||||
ubuntu-22-rocm:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -895,8 +937,8 @@ jobs:
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
|
||||
evict-old-files: 1d
|
||||
key: release-ubuntu-22.04-rocm-${{ matrix.ROCM_VERSION }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -974,6 +1016,8 @@ jobs:
|
||||
name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
|
||||
|
||||
windows-hip:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
|
||||
runs-on: windows-2022
|
||||
|
||||
@@ -1010,13 +1054,13 @@ jobs:
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: C:\Program Files\AMD\ROCm
|
||||
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
|
||||
evict-old-files: 1d
|
||||
key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
|
||||
append-timestamp: false # note: use this only with non-concurrent jobs!
|
||||
|
||||
- name: Install ROCm
|
||||
if: steps.cache-rocm.outputs.cache-hit != 'true'
|
||||
@@ -1088,6 +1132,8 @@ jobs:
|
||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
runs-on: macos-15
|
||||
|
||||
steps:
|
||||
@@ -1108,6 +1154,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -1142,96 +1189,102 @@ jobs:
|
||||
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
|
||||
|
||||
# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
# in order to enable it again, we have to provision dedicated runners to run it
|
||||
# openEuler-cann:
|
||||
# strategy:
|
||||
# matrix:
|
||||
# include:
|
||||
# # 910b with aclgraph (both architectures)
|
||||
# - arch: x86
|
||||
# chip_type: '910b'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'on'
|
||||
# - arch: aarch64
|
||||
# chip_type: '910b'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'on'
|
||||
# # 310p without aclgraph (both architectures)
|
||||
# - arch: x86
|
||||
# chip_type: '310p'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'off'
|
||||
# - arch: aarch64
|
||||
# chip_type: '310p'
|
||||
# build: 'Release'
|
||||
# use_acl_graph: 'off'
|
||||
# runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
#
|
||||
# - name: Free up disk space
|
||||
# uses: ggml-org/free-disk-space@v1.3.1
|
||||
# with:
|
||||
# tool-cache: true
|
||||
#
|
||||
# - name: Set container image
|
||||
# id: cann-image
|
||||
# run: |
|
||||
# image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
# echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
#
|
||||
# - name: Pull container image
|
||||
# run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
#
|
||||
# - name: Build
|
||||
# env:
|
||||
# BUILD_TYPE: ${{ matrix.build }}
|
||||
# SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
# USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
# run: |
|
||||
# HOST_UID=$(id -u)
|
||||
# HOST_GID=$(id -g)
|
||||
#
|
||||
# docker run --rm \
|
||||
# -v "${PWD}:/workspace" \
|
||||
# -w /workspace \
|
||||
# -e SOC_TYPE=${SOC_TYPE} \
|
||||
# -e BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
# "${{ steps.cann-image.outputs.image }}" \
|
||||
# bash -lc '
|
||||
# set -e
|
||||
# yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
# yum clean all && rm -rf /var/cache/yum
|
||||
# git config --global --add safe.directory "/workspace"
|
||||
# export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
# cmake -S . -B build \
|
||||
# -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
# -DGGML_CANN=on \
|
||||
# -DSOC_TYPE=${SOC_TYPE} \
|
||||
# -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
# cmake --build build -j $(nproc)
|
||||
#
|
||||
# chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
# '
|
||||
#
|
||||
# - name: Determine tag name
|
||||
# id: tag
|
||||
# uses: ./.github/actions/get-tag-name
|
||||
#
|
||||
# - name: Pack artifacts
|
||||
# run: |
|
||||
# cp LICENSE ./build/bin/
|
||||
# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
#
|
||||
# - name: Upload artifacts
|
||||
# uses: actions/upload-artifact@v6
|
||||
# with:
|
||||
# path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
# name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
|
||||
openEuler-cann:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
# 910b with aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
- arch: aarch64
|
||||
chip_type: '910b'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'on'
|
||||
# 310p without aclgraph (both architectures)
|
||||
- arch: x86
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
- arch: aarch64
|
||||
chip_type: '310p'
|
||||
build: 'Release'
|
||||
use_acl_graph: 'off'
|
||||
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Free up disk space
|
||||
uses: ggml-org/free-disk-space@v1.3.1
|
||||
with:
|
||||
tool-cache: true
|
||||
|
||||
- name: Set container image
|
||||
id: cann-image
|
||||
run: |
|
||||
image="ascendai/cann:${{ matrix.chip_type == '910b' && '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
|
||||
echo "image=${image}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Pull container image
|
||||
run: docker pull "${{ steps.cann-image.outputs.image }}"
|
||||
|
||||
- name: Build
|
||||
env:
|
||||
BUILD_TYPE: ${{ matrix.build }}
|
||||
SOC_TYPE: ascend${{ matrix.chip_type }}
|
||||
USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
|
||||
run: |
|
||||
HOST_UID=$(id -u)
|
||||
HOST_GID=$(id -g)
|
||||
|
||||
docker run --rm \
|
||||
-v "${PWD}:/workspace" \
|
||||
-w /workspace \
|
||||
-e SOC_TYPE=${SOC_TYPE} \
|
||||
-e BUILD_TYPE=${BUILD_TYPE} \
|
||||
-e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
|
||||
"${{ steps.cann-image.outputs.image }}" \
|
||||
bash -lc '
|
||||
set -e
|
||||
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
|
||||
yum clean all && rm -rf /var/cache/yum
|
||||
git config --global --add safe.directory "/workspace"
|
||||
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
|
||||
-DGGML_CANN=on \
|
||||
-DSOC_TYPE=${SOC_TYPE} \
|
||||
-DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
|
||||
'
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
uses: ./.github/actions/get-tag-name
|
||||
|
||||
- name: Pack artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
|
||||
ui-build:
|
||||
needs: [check_release]
|
||||
if: ${{ needs.check_release.outputs.should_release == 'true' }}
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -1247,17 +1300,18 @@ jobs:
|
||||
- windows
|
||||
- windows-cpu
|
||||
- windows-cuda
|
||||
- windows-sycl
|
||||
#- windows-sycl
|
||||
- windows-hip
|
||||
- ubuntu-22-rocm
|
||||
- ubuntu-cpu
|
||||
- ubuntu-vulkan
|
||||
- ubuntu-24-openvino
|
||||
- ubuntu-24-sycl
|
||||
#- ubuntu-24-sycl
|
||||
- android-arm64
|
||||
- macOS-cpu
|
||||
- macos-cpu
|
||||
- ios-xcode-build
|
||||
- openEuler-cann
|
||||
#- openEuler-cann
|
||||
- ui-build
|
||||
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
@@ -1317,6 +1371,18 @@ jobs:
|
||||
mv -v artifact/*.zip release
|
||||
mv -v artifact/*.tar.gz release
|
||||
|
||||
- name: Download UI build
|
||||
id: download_ui
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: ./ui-dist
|
||||
|
||||
- name: Package UI
|
||||
id: package_ui
|
||||
run: |
|
||||
tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist .
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
@@ -1333,7 +1399,7 @@ jobs:
|
||||
|
||||
**macOS/iOS:**
|
||||
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
|
||||
- [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
|
||||
- macOS Apple Silicon (arm64, KleidiAI enabled) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23780)
|
||||
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
|
||||
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)
|
||||
|
||||
@@ -1345,8 +1411,7 @@ jobs:
|
||||
- [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
|
||||
- [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
|
||||
- [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
|
||||
- [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz)
|
||||
- Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
|
||||
**Android:**
|
||||
- [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)
|
||||
@@ -1355,16 +1420,20 @@ jobs:
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
|
||||
- Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
|
||||
|
||||
**openEuler:**
|
||||
- [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
|
||||
- [openEuler x86 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86-aclgraph.tar.gz)
|
||||
- [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
|
||||
- [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
|
||||
- [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705)
|
||||
- openEuler x86 (310p)
|
||||
- openEuler x86 (910b, ACL Graph)
|
||||
- openEuler aarch64 (310p)
|
||||
- openEuler aarch64 (910b, ACL Graph)
|
||||
|
||||
**UI:**
|
||||
- [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
|
||||
36
.github/workflows/server-sanitize.yml
vendored
36
.github/workflows/server-sanitize.yml
vendored
@@ -26,10 +26,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -37,7 +37,7 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: [self-hosted, CPU, Linux, llama-server]
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@@ -46,19 +46,19 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get -y install \
|
||||
build-essential \
|
||||
xxd \
|
||||
git \
|
||||
cmake \
|
||||
curl \
|
||||
wget \
|
||||
language-pack-en \
|
||||
libssl-dev
|
||||
#- name: Dependencies
|
||||
# id: depends
|
||||
# run: |
|
||||
# sudo apt-get update
|
||||
# sudo apt-get -y install \
|
||||
# build-essential \
|
||||
# xxd \
|
||||
# git \
|
||||
# cmake \
|
||||
# curl \
|
||||
# wget \
|
||||
# language-pack-en \
|
||||
# libssl-dev
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
|
||||
85
.github/workflows/server-self-hosted.yml
vendored
85
.github/workflows/server-self-hosted.yml
vendored
@@ -29,10 +29,10 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -91,45 +91,44 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
# TODO: provision CUDA runner
|
||||
# server-cuda:
|
||||
# runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
#
|
||||
# name: server-cuda (${{ matrix.wf_name }})
|
||||
# strategy:
|
||||
# matrix:
|
||||
# build_type: [Release]
|
||||
# wf_name: ["GPUx1"]
|
||||
# include:
|
||||
# - build_type: Release
|
||||
# extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
# wf_name: "GPUx1, backend-sampling"
|
||||
# fail-fast: false
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# id: checkout
|
||||
# uses: actions/checkout@v6
|
||||
# with:
|
||||
# fetch-depth: 0
|
||||
# ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
#
|
||||
# - name: Build
|
||||
# id: cmake_build
|
||||
# run: |
|
||||
# cmake -B build -DGGML_SCHED_NO_REALLOC=ON
|
||||
# cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
|
||||
#
|
||||
# - name: Tests
|
||||
# id: server_integration_tests
|
||||
# if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
# run: |
|
||||
# cd tools/server/tests
|
||||
# python3 -m venv venv
|
||||
# source venv/bin/activate
|
||||
# pip install -r requirements.txt
|
||||
# export ${{ matrix.extra_args }}
|
||||
# pytest -v -x -m "not slow"
|
||||
server-cuda:
|
||||
runs-on: [self-hosted, llama-server, Linux, NVIDIA]
|
||||
|
||||
name: server-cuda (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
wf_name: ["GPUx1"]
|
||||
include:
|
||||
- build_type: Release
|
||||
extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
|
||||
wf_name: "GPUx1, backend-sampling"
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
- name: Tests
|
||||
id: server_integration_tests
|
||||
if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
|
||||
run: |
|
||||
cd tools/server/tests
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
export ${{ matrix.extra_args }}
|
||||
pytest -v -x -m "not slow"
|
||||
|
||||
server-kleidiai:
|
||||
runs-on: ah-ubuntu_22_04-c8g_8x
|
||||
|
||||
47
.github/workflows/server.yml
vendored
47
.github/workflows/server.yml
vendored
@@ -44,20 +44,20 @@ on:
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
server:
|
||||
runs-on: ubuntu-latest
|
||||
ubuntu:
|
||||
runs-on: ubuntu-24.04
|
||||
|
||||
name: server (${{ matrix.wf_name }})
|
||||
name: ubuntu (${{ matrix.wf_name }})
|
||||
strategy:
|
||||
matrix:
|
||||
build_type: [Release]
|
||||
@@ -93,18 +93,17 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
node-version: "24"
|
||||
cache: "npm"
|
||||
cache-dependency-path: "tools/ui/package-lock.json"
|
||||
key: server-ubuntu-24.04-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build \
|
||||
-DLLAMA_BUILD_BORINGSSL=ON \
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||
|
||||
@@ -131,8 +130,8 @@ jobs:
|
||||
export ${{ matrix.extra_args }}
|
||||
SLOW_TESTS=1 pytest -v -x
|
||||
|
||||
server-windows:
|
||||
runs-on: windows-2022
|
||||
windows:
|
||||
runs-on: windows-2025
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -142,16 +141,24 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
node-version: "24"
|
||||
key: server-windows-2025-x64
|
||||
evict-old-files: 1d
|
||||
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||
cmake -B build -G "Ninja Multi-Config" ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DGGML_SCHED_NO_REALLOC=ON
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config Release -j %NINJA_JOBS% --target llama-server
|
||||
|
||||
- name: Python setup
|
||||
id: setup_python
|
||||
|
||||
7
.github/workflows/ui-build.yml
vendored
7
.github/workflows/ui-build.yml
vendored
@@ -5,8 +5,7 @@ on:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Build static output
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast]
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
@@ -31,7 +30,7 @@ jobs:
|
||||
|
||||
- name: Generate checksums
|
||||
run: |
|
||||
cd build/tools/ui/dist
|
||||
cd tools/ui/dist
|
||||
for f in *; do
|
||||
sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
|
||||
done
|
||||
@@ -40,5 +39,5 @@ jobs:
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
retention-days: 1
|
||||
|
||||
6
.github/workflows/ui-publish.yml
vendored
6
.github/workflows/ui-publish.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: ui-build
|
||||
path: build/tools/ui/dist/
|
||||
path: tools/ui/dist/
|
||||
|
||||
- name: Install Hugging Face Hub CLI
|
||||
run: pip install -U huggingface_hub
|
||||
@@ -49,12 +49,12 @@ jobs:
|
||||
- name: Sync built files to Hugging Face bucket (version tag)
|
||||
run: |
|
||||
# Upload the built files to the Hugging Face bucket under the release version
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
|
||||
|
||||
- name: Sync built files to Hugging Face bucket (latest)
|
||||
run: |
|
||||
# Also upload to the 'latest' directory for fallback downloads
|
||||
hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
|
||||
|
||||
- name: Verify upload
|
||||
run: |
|
||||
|
||||
118
.github/workflows/ui-self-hosted.yml
vendored
Normal file
118
.github/workflows/ui-self-hosted.yml
vendored
Normal file
@@ -0,0 +1,118 @@
|
||||
name: UI (self-hosted)
|
||||
|
||||
# these are the same as ui.yml, but with self-hosted runners
|
||||
# the runners come with pre-installed Playwright browsers version: 1.56.1
|
||||
# the jobs are much lighter because they don't need to install node and playwright browsers
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
sha:
|
||||
description: 'Commit SHA1 to build'
|
||||
required: false
|
||||
type: string
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-self-hosted.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-self-hosted.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ui-build:
|
||||
name: Build static output
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
ui-checks:
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run type checking
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run check
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run linting
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run lint
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Client tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:client
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run Unit tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:unit
|
||||
working-directory: tools/ui
|
||||
|
||||
e2e-tests:
|
||||
name: E2E Tests
|
||||
needs: ui-build
|
||||
runs-on: [self-hosted, PLAYWRIGHT]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||
|
||||
- name: Install dependencies
|
||||
id: setup
|
||||
run: npm ci
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build application
|
||||
if: ${{ always() && steps.setup.conclusion == 'success' }}
|
||||
run: npm run build
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Build Storybook
|
||||
if: ${{ always() }}
|
||||
run: npm run build-storybook
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run UI tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:ui -- --testTimeout=60000
|
||||
working-directory: tools/ui
|
||||
|
||||
- name: Run E2E tests
|
||||
if: ${{ always() }}
|
||||
run: npm run test:e2e
|
||||
working-directory: tools/ui
|
||||
@@ -1,4 +1,4 @@
|
||||
name: CI (UI)
|
||||
name: UI
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@@ -11,23 +11,25 @@ on:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/ui-ci.yml',
|
||||
'.github/workflows/ui.yml',
|
||||
'.github/workflows/ui-build.yml',
|
||||
'tools/ui/**.*',
|
||||
'tools/server/tests/**.*'
|
||||
]
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
LLAMA_ARG_LOG_COLORS: 1
|
||||
LLAMA_ARG_LOG_PREFIX: 1
|
||||
LLAMA_ARG_LOG_TIMESTAMPS: 1
|
||||
LLAMA_ARG_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
@@ -39,7 +41,7 @@ jobs:
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
|
||||
ui-checks:
|
||||
name: UI Checks
|
||||
name: Checks
|
||||
needs: ui-build
|
||||
runs-on: ubuntu-latest
|
||||
continue-on-error: true
|
||||
4
.github/workflows/update-ops-docs.yml
vendored
4
.github/workflows/update-ops-docs.yml
vendored
@@ -3,18 +3,20 @@ name: Update Operations Documentation
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- '.github/workflows/update-ops-docs.yml'
|
||||
- 'docs/ops.md'
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/update-ops-docs.yml'
|
||||
- 'docs/ops.md'
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
|
||||
jobs:
|
||||
update-ops-docs:
|
||||
runs-on: ubuntu-slim
|
||||
runs-on: [self-hosted, fast, ARM64]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- By very precise and concise when writing code, comments, explanations, etc.
|
||||
- Be very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
|
||||
@@ -16,7 +16,8 @@ Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
|
||||
- Ask the user to tell you what model was used and write it in place of [MODEL]
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
|
||||
@@ -108,20 +108,10 @@ option(LLAMA_BUILD_TESTS "llama: build tests"
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" OFF)
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
# Backward compat: when old var is set but new one isn't, forward the value
|
||||
if(DEFINED LLAMA_BUILD_WEBUI)
|
||||
set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
|
||||
message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
|
||||
endif()
|
||||
if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
|
||||
set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
|
||||
message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
|
||||
endif()
|
||||
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
/examples/parallel/ @ggerganov
|
||||
/examples/passkey/ @ggerganov
|
||||
/examples/retrieval/ @ggerganov
|
||||
/examples/save-load-state/ @ggerganov
|
||||
/examples/speculative-simple/ @ggerganov
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
|
||||
@@ -63,6 +63,7 @@ After submitting your PR:
|
||||
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
|
||||
- Let other maintainers merge their own PRs
|
||||
- When merging a PR, make sure you have a good understanding of the changes
|
||||
- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
|
||||
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
|
||||
|
||||
Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
|
||||
|
||||
@@ -27,6 +27,7 @@ LLM inference in C/C++
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
|
||||
|
||||
----
|
||||
|
||||
@@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
| [WebGPU](docs/build.md#webgpu) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
|
||||
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
|
||||
|
||||
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_APP=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
|
||||
23
ci/run.sh
23
ci/run.sh
@@ -66,6 +66,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
@@ -114,10 +116,7 @@ fi
|
||||
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||
|
||||
# if on Mac, disable METAL
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
|
||||
MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
|
||||
MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
|
||||
if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
|
||||
@@ -133,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
|
||||
|
||||
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
|
||||
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
|
||||
@@ -167,6 +166,8 @@ fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
|
||||
else
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_OPENVINO} ]; then
|
||||
@@ -238,7 +239,7 @@ function gg_run_ctest_debug {
|
||||
(cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
@@ -461,10 +462,10 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
@@ -700,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
export LLAMA_ARG_LOG_PREFIX=1
|
||||
export LLAMA_ARG_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
|
||||
|
||||
@@ -1334,12 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"-cpent", "--checkpoint-every-n-tokens"}, "N",
|
||||
string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
|
||||
{"-cms", "--checkpoint-min-step"}, "N",
|
||||
string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step),
|
||||
[](common_params & params, int value) {
|
||||
params.checkpoint_every_nt = value;
|
||||
if (value < 0) {
|
||||
throw std::invalid_argument("checkpoint-min-step must be non-negative");
|
||||
}
|
||||
params.checkpoint_min_step = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"-cram", "--cache-ram"}, "N",
|
||||
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
|
||||
@@ -2995,7 +2998,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
key_file.close();
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--ssl-key-file"}, "FNAME",
|
||||
"path to file a PEM-encoded SSL private key",
|
||||
@@ -3023,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.default_template_kwargs[item.key()] = item.value().dump();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
|
||||
add_opt(common_arg(
|
||||
{"-to", "--timeout"}, "N",
|
||||
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
||||
@@ -3324,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params &, const std::string & value) {
|
||||
common_log_set_file(common_log_main(), value.c_str());
|
||||
}
|
||||
).set_env("LLAMA_LOG_FILE"));
|
||||
).set_env("LLAMA_ARG_LOG_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"--log-colors"}, "[on|off|auto]",
|
||||
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
|
||||
@@ -3341,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_LOG_COLORS"));
|
||||
).set_env("LLAMA_ARG_LOG_COLORS"));
|
||||
add_opt(common_arg(
|
||||
{"-v", "--verbose", "--log-verbose"},
|
||||
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||
@@ -3356,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params) {
|
||||
params.offline = true;
|
||||
}
|
||||
).set_env("LLAMA_OFFLINE"));
|
||||
).set_env("LLAMA_ARG_OFFLINE"));
|
||||
add_opt(common_arg(
|
||||
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
|
||||
@@ -3371,7 +3374,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.verbosity = value;
|
||||
common_log_set_verbosity_thold(value);
|
||||
}
|
||||
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||
).set_env("LLAMA_ARG_LOG_VERBOSITY"));
|
||||
add_opt(common_arg(
|
||||
{"--log-prefix"},
|
||||
{"--no-log-prefix"},
|
||||
|
||||
@@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
|
||||
|
||||
namespace autoparser {
|
||||
|
||||
static const std::string ERR_TMPL = "#**ERROR**#";
|
||||
|
||||
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
|
||||
generation_params tmpl_params;
|
||||
tmpl_params.messages = params.messages;
|
||||
@@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
|
||||
return common_chat_template_direct_apply(tmpl, tmpl_params);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_DBG("Template application failed: %s\n", e.what());
|
||||
return "";
|
||||
return ERR_TMPL;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants(
|
||||
std::string output_B = apply_template(tmpl, params_B);
|
||||
|
||||
// Check for template application failures
|
||||
if (output_A.empty() || output_B.empty()) {
|
||||
if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
|
||||
@@ -377,6 +377,8 @@ struct analyze_tools : analyze_base {
|
||||
|
||||
struct autoparser {
|
||||
jinja::caps jinja_caps;
|
||||
std::string user_start;
|
||||
std::string assistant_start;
|
||||
analyze_reasoning reasoning;
|
||||
analyze_content content;
|
||||
analyze_tools tools;
|
||||
@@ -387,6 +389,10 @@ struct autoparser {
|
||||
|
||||
autoparser() = default;
|
||||
|
||||
// Find the starting marker for the user message and assistant message
|
||||
std::string detect_user_start_marker(const common_chat_template & tmpl);
|
||||
std::string detect_assistant_start_marker(const common_chat_template & tmpl);
|
||||
|
||||
// Run full differential analysis on a template
|
||||
void analyze_template(const common_chat_template & tmpl);
|
||||
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include "peg-parser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
|
||||
#define ANSI_RESET "\033[0m"
|
||||
#define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
|
||||
@@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
|
||||
static const std::string ARG_FIRST = "AA_ARG_FST_AA";
|
||||
static const std::string ARG_SECOND = "BB_ARG_SND_BB";
|
||||
static const std::string USER_MSG = "U_USER_MSG Hello END_U";
|
||||
static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
|
||||
static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
|
||||
static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
|
||||
static const std::string CALL_ID_001 = "call00001";
|
||||
@@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.content.end = "<|END_OF_TURN_TOKEN|>";
|
||||
analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
|
||||
analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
|
||||
analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
@@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
|
||||
analysis.tools.function.close = "```";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
|
||||
}
|
||||
}
|
||||
},
|
||||
// Nemotron Nano v2
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
|
||||
tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
|
||||
|
||||
analysis.tools.format.mode = tool_format::JSON_NATIVE;
|
||||
analysis.tools.format.section_start = "";
|
||||
analysis.tools.format.section_end = "";
|
||||
analysis.tools.format.per_call_start = "<TOOLCALL>";
|
||||
analysis.tools.format.per_call_end = "</TOOLCALL>";
|
||||
analysis.content.mode = content_mode::PLAIN;
|
||||
analysis.content.start = "";
|
||||
analysis.content.end = "";
|
||||
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
|
||||
analysis.reasoning.start = "<think>\n\n";
|
||||
analysis.reasoning.end = "</think>";
|
||||
analysis.assistant_start = "<SPECIAL_11>Assistant";
|
||||
analysis.user_start = "<SPECIAL_11>User";
|
||||
analysis.preserved_tokens.clear();
|
||||
analysis.preserved_tokens.push_back("<SPECIAL_12>");
|
||||
analysis.preserved_tokens.push_back("<SPECIAL_11>");
|
||||
analysis.preserved_tokens.push_back("</think>");
|
||||
analysis.preserved_tokens.push_back("<TOOLCALL>");
|
||||
analysis.preserved_tokens.push_back("</TOOLCALL>");
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Fireworks
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
|
||||
" + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
|
||||
analysis.assistant_start = "<|start_header_id|>assistant<|end_header_id|>";
|
||||
analysis.user_start = "<|start_header_id|>user<|end_header_id|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Solar Open
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
|
||||
analysis.assistant_start = "<|begin|>assistant";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
// Apriel 1.6
|
||||
[](const common_chat_template & tmpl, autoparser & analysis) -> void {
|
||||
if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
|
||||
analysis.user_start = "<|begin_user|>";
|
||||
analysis.assistant_start = "<|begin_assistant|>";
|
||||
LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
|
||||
}
|
||||
},
|
||||
|
||||
});
|
||||
|
||||
// Common JSON structures
|
||||
@@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
|
||||
content = analyze_content(tmpl, reasoning);
|
||||
tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
|
||||
assistant_start = detect_assistant_start_marker(tmpl);
|
||||
user_start = detect_user_start_marker(tmpl);
|
||||
collect_preserved_tokens();
|
||||
|
||||
for (auto & workaround : workarounds) {
|
||||
@@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
|
||||
}
|
||||
|
||||
LOG_DBG("\n--- Reasoning & Content Structure ---\n");
|
||||
LOG_DBG("user_msg_start: %s\n", user_start.c_str());
|
||||
LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
|
||||
LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
|
||||
LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
|
||||
LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
|
||||
@@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() {
|
||||
add_token(tools.call_id.suffix);
|
||||
}
|
||||
|
||||
std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
|
||||
json user_msg = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
json assistant_no_reasoning = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({ user_msg });
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg, assistant_no_reasoning });
|
||||
}
|
||||
);
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
|
||||
return "";
|
||||
}
|
||||
|
||||
auto usermsg = comparison->diff.right;
|
||||
if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
|
||||
}
|
||||
|
||||
auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
|
||||
if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
|
||||
ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
|
||||
}
|
||||
if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
|
||||
ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
|
||||
}
|
||||
return trim_whitespace(ast_prefix);
|
||||
}
|
||||
|
||||
std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
|
||||
json user_msg = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG }
|
||||
};
|
||||
|
||||
json assistant = json{
|
||||
{ "role", "assistant" },
|
||||
{ "content", ASSISTANT_MSG }
|
||||
};
|
||||
|
||||
json user_msg_two = json{
|
||||
{ "role", "user" },
|
||||
{ "content", USER_MSG_TWO }
|
||||
};
|
||||
|
||||
template_params params;
|
||||
params.messages = json::array({});
|
||||
params.add_generation_prompt = false;
|
||||
params.enable_thinking = true;
|
||||
|
||||
auto comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg });
|
||||
}
|
||||
);
|
||||
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
|
||||
params.messages = json::array({ user_msg_two, assistant });
|
||||
comparison = compare_variants(
|
||||
tmpl, params, [&](template_params & p) {
|
||||
p.messages = json::array({ user_msg_two, assistant, user_msg });
|
||||
}
|
||||
);
|
||||
if (!comparison) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
auto usermsg = comparison->diff.right;
|
||||
if (usermsg.find(USER_MSG) == std::string::npos) {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
|
||||
}
|
||||
|
||||
if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
|
||||
usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
|
||||
}
|
||||
|
||||
auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
|
||||
auto candidate_split = segmentize_markers(candidate);
|
||||
std::stringstream result;
|
||||
bool encountered_marker = false;
|
||||
for (const auto & mrk : candidate_split) {
|
||||
std::string lower_mrk = std::string(mrk.value);
|
||||
std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
|
||||
[](unsigned char c) { return std::tolower(c); });
|
||||
// heuristic to weed out potential end markers, but only at the start
|
||||
if (mrk.type == segment_type::MARKER && !encountered_marker &&
|
||||
(lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
|
||||
continue;
|
||||
}
|
||||
if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
|
||||
continue;
|
||||
}
|
||||
encountered_marker |= mrk.type == segment_type::MARKER;
|
||||
result << mrk.value;
|
||||
}
|
||||
return trim_whitespace(result.str());
|
||||
}
|
||||
|
||||
analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
|
||||
: analyze_base(tmpl) {
|
||||
LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
|
||||
|
||||
@@ -90,6 +90,45 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
|
||||
return text;
|
||||
}
|
||||
|
||||
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
|
||||
if (delims.empty() || prompt.empty()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
|
||||
std::vector<std::string> all_delims;
|
||||
std::vector<common_peg_parser> tagged_messages;
|
||||
|
||||
all_delims.reserve(delims.size());
|
||||
tagged_messages.reserve(delims.size());
|
||||
for (const auto & d : delims) {
|
||||
all_delims.push_back(d.delimiter);
|
||||
}
|
||||
|
||||
auto any_delim = p.until_one_of(all_delims);
|
||||
for (const auto & d : delims) {
|
||||
tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
|
||||
}
|
||||
|
||||
return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
|
||||
});
|
||||
|
||||
common_peg_parse_context ctx(prompt);
|
||||
const auto result = parser.parse(ctx);
|
||||
if (!result.success()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::vector<common_chat_msg_span> spans;
|
||||
ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
|
||||
if (!node.tag.empty()) {
|
||||
spans.push_back({ node.tag, node.start, node.end - node.start });
|
||||
}
|
||||
});
|
||||
|
||||
return spans;
|
||||
}
|
||||
|
||||
json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
|
||||
if (!content.empty() && !content_parts.empty()) {
|
||||
throw std::runtime_error("Cannot specify both content and content_parts");
|
||||
@@ -1042,6 +1081,14 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
|
||||
data.prompt = prompt;
|
||||
data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
data.message_spans = common_chat_split_by_role(prompt, {
|
||||
{ "assistant", "<|start|>assistant" },
|
||||
{ "user", "<|start|>user" },
|
||||
{ "system", "<|start|>developer" },
|
||||
{ "system", "<|start|>system" },
|
||||
{ "tool", "<|start|>functions" },
|
||||
});
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
|
||||
@@ -1181,6 +1228,11 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
|
||||
data.prompt += data.generation_prompt;
|
||||
}
|
||||
|
||||
data.message_spans = common_chat_split_by_role(data.prompt, {
|
||||
{ "user", "<|turn>user\n" },
|
||||
{ "assistant", "<|turn>model\n" },
|
||||
});
|
||||
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4;
|
||||
data.supports_thinking = true;
|
||||
data.thinking_start_tag = "<|channel>thought";
|
||||
@@ -2393,6 +2445,19 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
struct autoparser::autoparser autoparser;
|
||||
autoparser.analyze_template(tmpl);
|
||||
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
||||
|
||||
std::vector<common_chat_msg_delimiter> delimiters;
|
||||
if (!autoparser.assistant_start.empty()) {
|
||||
delimiters.push_back({ "assistant", autoparser.assistant_start });
|
||||
}
|
||||
if (!autoparser.user_start.empty()) {
|
||||
delimiters.push_back({ "user", autoparser.user_start });
|
||||
}
|
||||
|
||||
if (!delimiters.empty()) {
|
||||
auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
|
||||
}
|
||||
|
||||
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
||||
if (auto_params.supports_thinking) {
|
||||
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
|
||||
|
||||
@@ -143,6 +143,17 @@ struct common_chat_msg_diff {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_chat_msg_span {
|
||||
std::string role;
|
||||
std::size_t pos = 0;
|
||||
std::size_t len = 0;
|
||||
};
|
||||
|
||||
struct common_chat_msg_delimiter {
|
||||
std::string role;
|
||||
std::string delimiter;
|
||||
};
|
||||
|
||||
struct common_chat_tool {
|
||||
std::string name;
|
||||
std::string description;
|
||||
@@ -208,6 +219,7 @@ struct common_chat_params {
|
||||
std::vector<std::string> preserved_tokens;
|
||||
std::vector<std::string> additional_stops;
|
||||
std::string parser;
|
||||
std::vector<common_chat_msg_span> message_spans;
|
||||
};
|
||||
|
||||
// per-message parsing syntax
|
||||
@@ -219,6 +231,7 @@ struct common_chat_parser_params {
|
||||
bool reasoning_in_content = false;
|
||||
std::string generation_prompt;
|
||||
bool parse_tool_calls = true;
|
||||
bool is_continuation = false;
|
||||
bool echo = false; // Include assistant prefilled msg in output
|
||||
bool debug = false; // Enable debug output for PEG parser
|
||||
common_peg_arena parser = {};
|
||||
@@ -303,6 +316,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
const std::string & src,
|
||||
autoparser::generation_params & params);
|
||||
|
||||
|
||||
// specialized per-task preset
|
||||
struct common_chat_prompt_preset {
|
||||
std::string system;
|
||||
@@ -310,3 +324,6 @@ struct common_chat_prompt_preset {
|
||||
};
|
||||
|
||||
common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
|
||||
|
||||
std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
|
||||
|
||||
|
||||
@@ -445,6 +445,27 @@ std::string string_strip(const std::string & str) {
|
||||
return str.substr(start, end - start);
|
||||
}
|
||||
|
||||
std::string string_lcs(std::string_view a, std::string_view b) {
|
||||
if (a.empty() || b.empty()) return {};
|
||||
|
||||
std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
|
||||
size_t best_len = 0;
|
||||
size_t best_end_a = 0;
|
||||
|
||||
for (size_t i = 1; i <= a.size(); ++i) {
|
||||
for (size_t j = 1; j <= b.size(); ++j) {
|
||||
if (a[i - 1] == b[j - 1]) {
|
||||
dp[i][j] = dp[i - 1][j - 1] + 1;
|
||||
if (dp[i][j] > best_len) {
|
||||
best_len = dp[i][j];
|
||||
best_end_a = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::string(a.substr(best_end_a - best_len, best_len));
|
||||
}
|
||||
|
||||
std::string string_get_sortable_timestamp() {
|
||||
using clock = std::chrono::system_clock;
|
||||
|
||||
|
||||
@@ -594,7 +594,7 @@ struct common_params {
|
||||
bool cache_prompt = true; // whether to enable prompt caching
|
||||
bool cache_idle_slots = true; // save and clear idle slots upon starting a new task
|
||||
int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot
|
||||
int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill
|
||||
int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints
|
||||
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
@@ -617,11 +617,7 @@ struct common_params {
|
||||
std::map<std::string, std::string> default_template_kwargs;
|
||||
|
||||
// UI configs
|
||||
#ifdef LLAMA_UI_DEFAULT_ENABLED
|
||||
bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
|
||||
#else
|
||||
bool ui = true; // default to enabled when not set
|
||||
#endif
|
||||
bool ui = true;
|
||||
|
||||
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
|
||||
bool webui = ui;
|
||||
@@ -735,6 +731,7 @@ std::string string_format(const char * fmt, ...);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string string_get_sortable_timestamp();
|
||||
std::string string_lcs(std::string_view a, std::string_view b);
|
||||
|
||||
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
|
||||
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
|
||||
|
||||
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const llama_model_params * mparams,
|
||||
const llama_context_params * cparams,
|
||||
|
||||
16
common/fit.h
16
common/fit.h
@@ -1,6 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "llama.h"
|
||||
#include "../src/llama-ext.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
enum common_params_fit_status {
|
||||
COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
|
||||
@@ -30,3 +35,14 @@ void common_fit_print(
|
||||
struct llama_context_params * cparams);
|
||||
|
||||
void common_memory_breakdown_print(const struct llama_context * ctx);
|
||||
|
||||
// Load a model + context with no_alloc and return the per-device memory breakdown.
|
||||
std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
const char * path_model,
|
||||
const struct llama_model_params * mparams,
|
||||
const struct llama_context_params * cparams,
|
||||
std::vector<ggml_backend_dev_t> & devs,
|
||||
uint32_t & hp_ngl,
|
||||
uint32_t & hp_n_ctx_train,
|
||||
uint32_t & hp_n_expert,
|
||||
enum ggml_log_level log_level);
|
||||
|
||||
@@ -74,6 +74,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"Gemma3nForCausalLM": "gemma",
|
||||
"Gemma3nForConditionalGeneration": "gemma",
|
||||
"Gemma4ForConditionalGeneration": "gemma",
|
||||
"Gemma4ForCausalLM": "gemma",
|
||||
"GemmaForCausalLM": "gemma",
|
||||
"Glm4ForCausalLM": "glm",
|
||||
"Glm4MoeForCausalLM": "glm",
|
||||
@@ -215,6 +216,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
||||
"T5EncoderModel": "t5",
|
||||
"T5ForConditionalGeneration": "t5",
|
||||
"T5WithLMHeadModel": "t5",
|
||||
"TalkieForCausalLM": "talkie",
|
||||
"UMT5ForConditionalGeneration": "t5",
|
||||
"UMT5Model": "t5",
|
||||
"UltravoxModel": "ultravox",
|
||||
|
||||
@@ -119,7 +119,8 @@ class ModelBase:
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
fuse_gate_up_exps: bool = False):
|
||||
fuse_gate_up_exps: bool = False,
|
||||
fp8_as_q8: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
type(self) is TextModel or \
|
||||
type(self) is MmprojModel:
|
||||
@@ -148,6 +149,8 @@ class ModelBase:
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
self._is_nvfp4 = False
|
||||
self._is_mxfp4 = False
|
||||
self._fp8_as_q8 = fp8_as_q8
|
||||
self._fp8_dequantized: set[str] = set()
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
@@ -429,6 +432,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith(".activation_scale"): # unused
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith("_activation_scale"): # Mistral-Small-4-119B-2602, unused
|
||||
@@ -440,6 +445,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith(".qscale_act"):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "gptq":
|
||||
@@ -467,7 +474,14 @@ class ModelBase:
|
||||
elif quant_method == "compressed-tensors":
|
||||
quant_format = quant_config["format"]
|
||||
groups = quant_config["config_groups"]
|
||||
if len(groups) > 1:
|
||||
nvfp4_compressed_tensors = (
|
||||
quant_format == "nvfp4-pack-quantized"
|
||||
or quant_format == "mixed-precision"
|
||||
and bool(groups)
|
||||
and all(g.get("format") == "nvfp4-pack-quantized" for g in groups.values() if isinstance(g, dict))
|
||||
)
|
||||
|
||||
if len(groups) > 1 and not nvfp4_compressed_tensors:
|
||||
raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
|
||||
weight_config = tuple(groups.values())[0]["weights"]
|
||||
|
||||
@@ -476,6 +490,11 @@ class ModelBase:
|
||||
strategy = weight_config.get("strategy")
|
||||
assert strategy == "channel" or strategy == "block"
|
||||
assert weight_config.get("group_size") is None # didn't find a model using this yet
|
||||
is_fp8 = (
|
||||
quant_format == "float-quantized"
|
||||
and weight_config.get("type") == "float"
|
||||
and weight_config.get("num_bits") == 8
|
||||
)
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
@@ -483,6 +502,8 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
|
||||
tensors_to_remove.append(name)
|
||||
if self._fp8_as_q8 and is_fp8:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
elif quant_format == "pack-quantized":
|
||||
assert weight_config.get("strategy") == "group"
|
||||
assert weight_config.get("type", "int") == "int"
|
||||
@@ -505,6 +526,9 @@ class ModelBase:
|
||||
tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
|
||||
if (base_name + "_zero_point") in self.model_tensors:
|
||||
tensors_to_remove.append(base_name + "_zero_point")
|
||||
elif nvfp4_compressed_tensors:
|
||||
# Don't error from compressed-tensors, we'll handle them in _generate_nvfp4_tensors
|
||||
pass
|
||||
else:
|
||||
raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
|
||||
elif quant_method == "modelopt":
|
||||
@@ -514,10 +538,18 @@ class ModelBase:
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
if weight_name not in self.model_tensors:
|
||||
tensors_to_remove.append(name)
|
||||
continue
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
is_fp8_weight = False
|
||||
if self._fp8_as_q8:
|
||||
is_fp8_weight = w().dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
|
||||
tensors_to_remove.append(name)
|
||||
if is_fp8_weight:
|
||||
self._fp8_dequantized.add(weight_name)
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method is not None:
|
||||
@@ -605,8 +637,10 @@ class ModelBase:
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
||||
del name, new_name, bid, n_dims # unused
|
||||
|
||||
del new_name, bid # unused
|
||||
# Force FP8-original tensors to Q8_0 when requested; Q8_0 is faster than F16/BF16.
|
||||
if self._fp8_as_q8 and name in self._fp8_dequantized and n_dims >= 2:
|
||||
return gguf.GGMLQuantizationType.Q8_0
|
||||
return False
|
||||
|
||||
# some models need extra generated tensors (like rope_freqs)
|
||||
@@ -746,10 +780,13 @@ class ModelBase:
|
||||
del experts, merged
|
||||
|
||||
def prepare_tensors(self):
|
||||
# detect NVFP4 quantization (ModelOpt format)
|
||||
quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
|
||||
quant_method = (self.hparams.get("quantization_config") or {}).get("quant_method")
|
||||
quant_layers = (self.hparams.get("quantization_config") or {}).get("quantized_layers") or {}
|
||||
# detect NVFP4 quantization (ModelOpt and Compressed-tensors formats)
|
||||
quantization_config = self.hparams.get("quantization_config") or {}
|
||||
quant_algo = quantization_config.get("quant_algo")
|
||||
quant_method = quantization_config.get("quant_method")
|
||||
quant_format = quantization_config.get("format")
|
||||
quant_groups = quantization_config.get("config_groups") or {}
|
||||
quant_layers = quantization_config.get("quantized_layers") or {}
|
||||
quant_config_file = self.dir_model / "hf_quant_config.json"
|
||||
|
||||
if (not quant_algo or not quant_layers) and quant_config_file.is_file():
|
||||
@@ -760,13 +797,25 @@ class ModelBase:
|
||||
producer_name = (producer.get("name") or "").lower()
|
||||
if quant_method is None:
|
||||
self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
|
||||
quant_method = producer_name
|
||||
quant_algo = quant_config.get("quant_algo", quant_algo)
|
||||
quant_method = quant_config.get("quant_method", quant_method)
|
||||
quant_format = quant_config.get("format", quant_format)
|
||||
quant_groups = quant_config.get("config_groups", quant_groups) or {}
|
||||
quant_layers = quant_config.get("quantized_layers", quant_layers) or {}
|
||||
|
||||
# Some models use per-tensor quant_algo (e.g. "MIXED_PRECISION" with
|
||||
# per-layer NVFP4/FP8) instead of a single global "NVFP4" value.
|
||||
nvfp4_compressed_tensors = quant_method == "compressed-tensors" and (
|
||||
quant_format == "nvfp4-pack-quantized"
|
||||
or quant_format == "mixed-precision"
|
||||
and bool(quant_groups)
|
||||
and all(g.get("format") == "nvfp4-pack-quantized" for g in quant_groups.values() if isinstance(g, dict))
|
||||
)
|
||||
if quant_algo != "NVFP4":
|
||||
if any(v.get("quant_algo") == "NVFP4" for v in quant_layers.values() if isinstance(v, dict)):
|
||||
if nvfp4_compressed_tensors:
|
||||
quant_algo = "NVFP4"
|
||||
elif any(str(v.get("quant_algo")).endswith("NVFP4") for v in quant_layers.values() if isinstance(v, dict)):
|
||||
quant_algo = "NVFP4"
|
||||
|
||||
self._is_nvfp4 = quant_algo == "NVFP4"
|
||||
@@ -776,6 +825,28 @@ class ModelBase:
|
||||
# This must run before dequant_model so NVFP4 tensors are removed
|
||||
# from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
|
||||
if self._is_nvfp4:
|
||||
if nvfp4_compressed_tensors:
|
||||
# Convert compressed-tensors 'global' scales into the reciprocal
|
||||
def inverse_scale(gen):
|
||||
def load():
|
||||
scale = LazyTorchTensor.to_eager(gen()).float()
|
||||
return 1.0 / scale
|
||||
return load
|
||||
|
||||
# Change the compressed-tensors names to the ModelOpt names for handling consistently later
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith(".weight_packed"):
|
||||
weight_name = name.removesuffix("_packed")
|
||||
if weight_name not in self.model_tensors:
|
||||
self.model_tensors[weight_name] = self.model_tensors.pop(name)
|
||||
elif name.endswith(".weight_global_scale"):
|
||||
scale2_name = name.replace(".weight_global_scale", ".weight_scale_2")
|
||||
if scale2_name not in self.model_tensors:
|
||||
self.model_tensors[scale2_name] = inverse_scale(self.model_tensors.pop(name))
|
||||
elif name.endswith(".input_global_scale"):
|
||||
input_scale_name = name.replace(".input_global_scale", ".input_scale")
|
||||
if input_scale_name not in self.model_tensors:
|
||||
self.model_tensors[input_scale_name] = inverse_scale(self.model_tensors.pop(name))
|
||||
self._generate_nvfp4_tensors()
|
||||
|
||||
self.dequant_model()
|
||||
@@ -1575,6 +1646,12 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
|
||||
# ref: https://huggingface.co/sarvamai/sarvam-30b
|
||||
res = "sarvam-moe"
|
||||
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
|
||||
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
|
||||
res = "talkie"
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -1617,6 +1694,11 @@ class TextModel(ModelBase):
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
|
||||
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
|
||||
# k-mer's own id (llama.cpp strips it on detokenization)
|
||||
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
|
||||
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
@@ -2359,10 +2441,9 @@ class MmprojModel(ModelBase):
|
||||
raise KeyError(f"could not find any of: {keys}")
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
del bid, name, n_dims # unused
|
||||
if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name:
|
||||
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
|
||||
return False
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
|
||||
class LazyTorchTensor(gguf.LazyBase):
|
||||
|
||||
@@ -614,7 +614,7 @@ class Gemma3NModel(Gemma3Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration")
|
||||
@ModelBase.register("Gemma4ForConditionalGeneration", "Gemma4ForCausalLM")
|
||||
class Gemma4Model(Gemma3Model):
|
||||
model_arch = gguf.MODEL_ARCH.GEMMA4
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
@@ -549,6 +548,7 @@ class _Qwen35MtpMixin:
|
||||
tensor_map: gguf.TensorNameMap
|
||||
no_mtp: bool
|
||||
mtp_only: bool
|
||||
_original_block_count: int | None = None
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
@@ -557,22 +557,44 @@ class _Qwen35MtpMixin:
|
||||
self.block_count += self.hparams.get("mtp_num_hidden_layers", 0)
|
||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
hparams = {**self.hparams, **self.hparams.get("text_config", {})}
|
||||
key = next((k for k in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] if k in hparams), None)
|
||||
type(self)._original_block_count = hparams.get(key)
|
||||
return super().index_tensors(remote_hf_model_id=remote_hf_model_id) # ty: ignore[unresolved-attribute]
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item):
|
||||
name, _ = item
|
||||
assert cls._original_block_count is not None
|
||||
# TODO: change TextModel to super()
|
||||
if (titem := TextModel.filter_tensors(item)) is None:
|
||||
return None
|
||||
name, gen = titem
|
||||
if name.startswith("model.mtp."):
|
||||
name = name.replace("model.", "", 1)
|
||||
if name.startswith("mtp."):
|
||||
if cls.no_mtp:
|
||||
return None
|
||||
return item
|
||||
if cls.mtp_only:
|
||||
canonical = name.replace("language_model.", "")
|
||||
keep = canonical in (
|
||||
remapper = {
|
||||
"fc": "eh_proj",
|
||||
"pre_fc_norm_embedding": "enorm",
|
||||
"pre_fc_norm_hidden": "hnorm",
|
||||
"norm": "shared_head.norm",
|
||||
}
|
||||
parts = name.split(".", 3)
|
||||
if len(parts) == 4 and parts[1] == "layers" and parts[2].isdecimal():
|
||||
mtp_idx = int(parts[2])
|
||||
name = f"model.layers.{cls._original_block_count + mtp_idx}.{parts[3]}"
|
||||
elif len(parts) == 3 and parts[1] in remapper:
|
||||
name = f"model.layers.{cls._original_block_count}.{remapper[parts[1]]}.{parts[2]}"
|
||||
elif cls.mtp_only:
|
||||
keep = name in (
|
||||
"model.embed_tokens.weight", "model.norm.weight", "lm_head.weight",
|
||||
"embed_tokens.weight", "norm.weight",
|
||||
)
|
||||
if not keep:
|
||||
return None
|
||||
return super().filter_tensors(item) # ty: ignore[unresolved-attribute]
|
||||
return name, gen
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters() # ty: ignore[unresolved-attribute]
|
||||
@@ -594,29 +616,6 @@ class _Qwen35MtpMixin:
|
||||
self.metadata.version, size_label=None, output_type=output_type, model_type=None) # pyright: ignore[reportAttributeAccessIssue] # ty: ignore[unresolved-attribute]
|
||||
self.fname_out = self.fname_out.parent / f"mtp-{fname_default}.gguf"
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("mtp."):
|
||||
n_layer = self.hparams["num_hidden_layers"]
|
||||
if name.find("layers.") != -1:
|
||||
assert bid is not None
|
||||
name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
|
||||
bid = bid + n_layer
|
||||
else:
|
||||
remapper = {
|
||||
"mtp.fc": "model.layers.{bid}.eh_proj",
|
||||
"mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
|
||||
"mtp.pre_fc_norm_hidden": "model.layers.{bid}.hnorm",
|
||||
"mtp.norm": "model.layers.{bid}.shared_head.norm",
|
||||
}
|
||||
stem = Path(name).stem
|
||||
suffix = Path(name).suffix
|
||||
tmpl = remapper[stem] + suffix
|
||||
for b in range(n_layer, self.block_count):
|
||||
yield from super().modify_tensors(data_torch, tmpl.format(bid=b), b) # ty: ignore[unresolved-attribute]
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid) # ty: ignore[unresolved-attribute]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
|
||||
class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase):
|
||||
|
||||
53
conversion/talkie.py
Normal file
53
conversion/talkie.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, TYPE_CHECKING
|
||||
|
||||
import torch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from torch import Tensor
|
||||
|
||||
from .base import LazyTorchTensor, ModelBase, TextModel, gguf
|
||||
|
||||
|
||||
@ModelBase.register("TalkieForCausalLM")
|
||||
class TalkieModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.TALKIE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
# Talkie used F.rms_norm without an explicit eps
|
||||
self.gguf_writer.add_layer_norm_rms_eps(torch.finfo(torch.float32).eps)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
prefix = f"model.blocks.{bid}." if bid is not None else ""
|
||||
suffix = name.removeprefix(prefix)
|
||||
|
||||
if suffix == "attn_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "mlp_gain.a_g":
|
||||
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, ".scale"), data_torch
|
||||
return
|
||||
elif suffix == "lm_head_gain.w_g":
|
||||
self.gguf_writer.add_logit_scale(LazyTorchTensor.to_eager(data_torch).item())
|
||||
return
|
||||
elif suffix in ("attn.attn_query.weight", "attn.attn_key.weight"):
|
||||
# absorb inverse rope
|
||||
head_dim = self.hparams["head_dim"]
|
||||
shape = data_torch.shape
|
||||
data_torch = torch.reshape(data_torch, (-1, head_dim, shape[-1]))
|
||||
signs = torch.ones((1, head_dim, 1), dtype=data_torch.dtype)
|
||||
signs[:, head_dim // 2 :, :] = -1
|
||||
if self.lazy:
|
||||
signs = LazyTorchTensor.from_eager(signs)
|
||||
# (n_head, head_dim, n_in) -> (n_out, n_in)
|
||||
data_torch = torch.reshape(data_torch * signs, shape)
|
||||
elif suffix == "attn.head_gain.head_g":
|
||||
# allow head gain to broadcast
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
|
||||
if not name.endswith(".weight"):
|
||||
name += ".weight"
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
@@ -148,6 +148,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--fuse-gate-up-exps", action="store_true",
|
||||
help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp8-as-q8", action="store_true",
|
||||
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.print_supported_models and args.model is None:
|
||||
@@ -264,7 +268,8 @@ def main() -> None:
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps
|
||||
fuse_gate_up_exps=args.fuse_gate_up_exps,
|
||||
fp8_as_q8=args.fp8_as_q8,
|
||||
)
|
||||
|
||||
if args.vocab_only:
|
||||
|
||||
@@ -156,6 +156,8 @@ models = [
|
||||
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
|
||||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
|
||||
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
||||
@@ -208,6 +208,16 @@ class LoraTorchTensor:
|
||||
def to(self, *args, **kwargs):
|
||||
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
|
||||
|
||||
def __mul__(self, other) -> LoraTorchTensor:
|
||||
# Only output-side multiplication for now
|
||||
# W = B @ A, so M_out * W == (M_out * B) @ A
|
||||
if not isinstance(other, (int, float)) and other.shape and other.shape[-1] != 1:
|
||||
raise NotImplementedError
|
||||
return LoraTorchTensor(self._lora_A, self._lora_B * other)
|
||||
|
||||
def __rmul__(self, other) -> LoraTorchTensor:
|
||||
return self * other
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
|
||||
del types # unused
|
||||
|
||||
@@ -459,7 +459,7 @@ Each returned parser is wrapped by `wrap_for_generation_prompt()`, which prepend
|
||||
|
||||
- Usage: `./bin/llama-template-analysis path/to/template.jinja`
|
||||
|
||||
**Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
|
||||
**Debug Logging**: Enable with `LLAMA_ARG_LOG_VERBOSITY=2`
|
||||
|
||||
- Shows detailed analysis steps, pattern extraction results, and generated parser structure
|
||||
|
||||
@@ -489,6 +489,7 @@ The following templates have active tests in `tests/test-chat.cpp`:
|
||||
| Qwen-QwQ-32B | Reasoning | Forced-open thinking |
|
||||
| NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
|
||||
| IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
|
||||
| IBM Granite 4.0 | JSON_NATIVE | `<tool_call>` wrapper (same template used by 4.1) |
|
||||
| ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
|
||||
| Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
|
||||
| DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
|
||||
|
||||
@@ -743,6 +743,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
|
||||
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
|
||||
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
|
||||
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
|
||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
|
||||
|
||||
@@ -753,6 +754,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo
|
||||
| Name | Function |
|
||||
|-----------------|----------------------------------------------------------------------------------|
|
||||
| DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. |
|
||||
| DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. |
|
||||
|
||||
## Design Rule
|
||||
|
||||
|
||||
@@ -33,8 +33,8 @@
|
||||
"name": "arm64-windows-snapdragon",
|
||||
"inherits": [ "base", "arm64-windows-llvm" ],
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
|
||||
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
||||
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
||||
|
||||
@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
|
||||
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
|
||||
|
||||
```
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
|
||||
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7
|
||||
[d]/> cd /workspace
|
||||
```
|
||||
|
||||
@@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies:
|
||||
- UCRT and Driver Kit
|
||||
- LLVM core libraries and Clang compiler (winget)
|
||||
- CMake, Git, Python (winget)
|
||||
- Hexagon SDK Community Edition 6.4 or later (see windows.md)
|
||||
- Hexagon SDK Community Edition 6.6 or later (see windows.md)
|
||||
- OpenCL SDK 2.3 or later (see windows.md)
|
||||
|
||||
Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
|
||||
@@ -45,7 +45,7 @@ Preset CMake variables:
|
||||
GGML_HEXAGON="ON"
|
||||
GGML_OPENCL="ON"
|
||||
GGML_OPENMP="OFF"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
|
||||
HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0"
|
||||
...
|
||||
-- Including OpenCL backend
|
||||
-- Including Hexagon backend
|
||||
|
||||
@@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2
|
||||
|
||||
Either use the trimmed down version (optimized for CI) from
|
||||
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
|
||||
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz
|
||||
|
||||
Or download the complete official version from
|
||||
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
|
||||
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0
|
||||
|
||||
Unzip/untar the archive into
|
||||
```
|
||||
c:\Qualcomm\Hexagon_SDK\6.4.0.2
|
||||
c:\Qualcomm\Hexagon_SDK\6.6.0.0
|
||||
```
|
||||
|
||||
## Install the latest Adreno GPU driver
|
||||
@@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam
|
||||
However, additional settings are required for generating and signing HTP Ops libraries.
|
||||
```
|
||||
> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
|
||||
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0"
|
||||
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07"
|
||||
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
|
||||
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0"
|
||||
|
||||
> cmake --preset arm64-windows-snapdragon-release -B build-wos
|
||||
...
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
1. Prepare Toolchain For RISCV
|
||||
~~~
|
||||
wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
|
||||
wget https://github.com/spacemit-com/toolchain/releases/download/v1.2.4/spacemit-toolchain-linux-glibc-x86_64-v1.2.4.tar.xz
|
||||
~~~
|
||||
|
||||
2. Build
|
||||
|
||||
@@ -735,7 +735,7 @@ ninja
|
||||
|
||||
To read documentation for how to build on Android, [click here](./android.md)
|
||||
|
||||
## WebGPU [In Progress]
|
||||
## WebGPU
|
||||
|
||||
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
|
||||
|
||||
|
||||
@@ -291,6 +291,7 @@ Here are some models known to work (w/ chat template override when needed):
|
||||
llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
|
||||
llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
|
||||
llama-server --jinja -fa -hf ibm-granite/granite-4.1-3b-GGUF:Q4_K_M
|
||||
|
||||
# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ Note that currently you cannot quantize the visual encoder because granite visio
|
||||
|
||||
|
||||
### 5. Running the Model in Llama cpp
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner.
|
||||
Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the llama.cpp banner.
|
||||
|
||||
```bash
|
||||
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
|
||||
|
||||
@@ -27,7 +27,6 @@ else()
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(retrieval)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
|
||||
@@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None:
|
||||
|
||||
def main(args_in: list[str] | None = None) -> None:
|
||||
output_choices = ["f32", "f16"]
|
||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||
dummy_val = np.uint32(1)
|
||||
if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")):
|
||||
# We currently only support Q8_0 output on little endian systems.
|
||||
output_choices.append("q8_0")
|
||||
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
|
||||
|
||||
@@ -25,6 +25,7 @@ android {
|
||||
arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON"
|
||||
|
||||
arguments += "-DBUILD_SHARED_LIBS=ON"
|
||||
arguments += "-DLLAMA_BUILD_APP=OFF"
|
||||
arguments += "-DLLAMA_BUILD_COMMON=ON"
|
||||
arguments += "-DLLAMA_OPENSSL=OFF"
|
||||
|
||||
|
||||
@@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO
|
||||
|
||||
## HuggingFace utilities
|
||||
The following targets are useful for creating collections and model repositories
|
||||
on Hugging Face in the the ggml-org. These can be used when preparing a release
|
||||
on Hugging Face in the ggml-org. These can be used when preparing a release
|
||||
to script the process for new model releases.
|
||||
|
||||
For the following targets a `HF_TOKEN` environment variable is required.
|
||||
|
||||
@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
|
||||
print("Using SentenceTransformer to apply all numbered layers")
|
||||
model = SentenceTransformer(model_path)
|
||||
tokenizer = model.tokenizer
|
||||
config = model[0].auto_model.config
|
||||
config = model[0].auto_model.config # ty: ignore[unresolved-attribute]
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 12)
|
||||
set(GGML_VERSION_MINOR 13)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(Threads)
|
||||
if (NOT GGML_SHARED_LIB)
|
||||
set(GGML_BASE_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
|
||||
set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
|
||||
|
||||
@@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB)
|
||||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
find_dependency(OpenMP)
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "")
|
||||
if (TARGET OpenMP::OpenMP_C)
|
||||
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C)
|
||||
endif()
|
||||
if (TARGET OpenMP::OpenMP_CXX)
|
||||
list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
|
||||
list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
@@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml)
|
||||
add_library(ggml::ggml-base UNKNOWN IMPORTED)
|
||||
set_target_properties(ggml::ggml-base
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}"
|
||||
INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}")
|
||||
|
||||
set(_ggml_all_targets "")
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
|
||||
@@ -76,6 +76,7 @@ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i
|
||||
// Utils
|
||||
// Create a buffer and allocate all the tensors in a ggml_context
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft
|
||||
// ggml_backend_alloc_ctx_tensors_from_buft returns NULL on failure or if all tensors in ctx are already allocated or zero-sized
|
||||
GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
|
||||
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
|
||||
|
||||
@@ -1189,8 +1189,8 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// a - x
|
||||
// b - dy
|
||||
// a - dy
|
||||
// b - x
|
||||
GGML_API struct ggml_tensor * ggml_silu_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
||||
@@ -76,10 +76,16 @@ extern "C" {
|
||||
struct ggml_context ** ctx;
|
||||
};
|
||||
|
||||
// callback to simulate or wrap a FILE pointer - read up to `len` bytes at `offset` into `output` and return the number of bytes read
|
||||
typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len);
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params);
|
||||
|
||||
// max_chunk_read is the maximum number of bytes that the GGUF code will read at once from the callback, a value of 0 means no limit
|
||||
GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
||||
@@ -87,7 +93,7 @@ extern "C" {
|
||||
|
||||
GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); // padded to gguf_get_alignment if and only if the gguf_context contains at least one tensor
|
||||
|
||||
GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
|
||||
|
||||
@@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC)
|
||||
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP)
|
||||
target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
add_library(ggml
|
||||
ggml-backend-dl.cpp
|
||||
ggml-backend-reg.cpp)
|
||||
|
||||
@@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o
|
||||
|
||||
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
|
||||
// shift all elements after idx by 1 to the left, overwriting the element at idx
|
||||
for (int i = idx; i < chunk->n_free_blocks; i++) {
|
||||
for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
|
||||
chunk->free_blocks[i] = chunk->free_blocks[i+1];
|
||||
}
|
||||
chunk->n_free_blocks--;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include <cstring>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <utility>
|
||||
@@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
|
||||
// meta backend buffer
|
||||
//
|
||||
|
||||
// Container to hold the tensor slices per simple ggml backend buffer.
|
||||
struct ggml_backend_meta_simple_tensor_container {
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
|
||||
|
||||
ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) {
|
||||
ctxs.reserve(n_simple);
|
||||
for (int i = 0; i < n_simple; i++) {
|
||||
ctxs.emplace_back(ggml_init(params));
|
||||
}
|
||||
}
|
||||
ggml_backend_meta_simple_tensor_container() {}
|
||||
};
|
||||
|
||||
struct ggml_backend_meta_buffer_context {
|
||||
// FIXME
|
||||
// Most tensors can simply be stored statically in their own buffer.
|
||||
// Externally created views however also need a mapping to simple tensors but they use the buffer of the view source.
|
||||
// If external views are simply using that buffer they will slowly deplete its memory.
|
||||
// Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp.
|
||||
// Long-term: tie the lifetime of external views to the meta backend executing the graph instead,
|
||||
// currently not possible due to graph-external operations in the backend scheduler.
|
||||
ggml_backend_meta_simple_tensor_container stc_static;
|
||||
ggml_backend_meta_simple_tensor_container stc_compute[2];
|
||||
int stc_compute_index = 0;
|
||||
int stc_compute_index_next = 0;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
// FIXME
|
||||
// The size of the split state cache is unbounded and can theoretically grow infinitely large.
|
||||
// However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive.
|
||||
static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
|
||||
|
||||
std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
|
||||
std::map< const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
|
||||
|
||||
struct buffer_config {
|
||||
ggml_context * ctx;
|
||||
ggml_backend_buffer_t buf;
|
||||
|
||||
buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
|
||||
};
|
||||
std::vector<buffer_config> buf_configs;
|
||||
|
||||
int debug;
|
||||
|
||||
ggml_backend_meta_buffer_context() {
|
||||
ggml_backend_meta_buffer_context(
|
||||
ggml_backend_meta_simple_tensor_container & stc_static,
|
||||
ggml_backend_meta_simple_tensor_container & stc_compute_0,
|
||||
ggml_backend_meta_simple_tensor_container & stc_compute_1,
|
||||
const std::vector<ggml_backend_buffer_t> & bufs)
|
||||
: stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} {
|
||||
this->bufs.reserve(bufs.size());
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
this->bufs.emplace_back(buf);
|
||||
}
|
||||
const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG");
|
||||
debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0;
|
||||
}
|
||||
|
||||
ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) {
|
||||
if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) {
|
||||
return stc_static;
|
||||
}
|
||||
return stc_compute[stc_compute_index];
|
||||
}
|
||||
};
|
||||
|
||||
static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
for (auto & [ctx, buf] : buf_ctx->buf_configs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
delete buf_ctx;
|
||||
}
|
||||
|
||||
static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
|
||||
return buf_ctx->buf_configs.size();
|
||||
return buf_ctx->bufs.size();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
|
||||
GGML_ASSERT(index < buf_ctx->buf_configs.size());
|
||||
return buf_ctx->buf_configs[index].buf;
|
||||
GGML_ASSERT(index < buf_ctx->bufs.size());
|
||||
return buf_ctx->bufs[index].get();
|
||||
}
|
||||
|
||||
static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
GGML_ASSERT(index < buf_ctx->buf_configs.size());
|
||||
GGML_ASSERT(index < buf_ctx->bufs.size());
|
||||
|
||||
auto it = buf_ctx->simple_tensors.find(tensor);
|
||||
if (it == buf_ctx->simple_tensors.end()) {
|
||||
ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor);
|
||||
auto it = stc.simple_tensors.find(tensor);
|
||||
if (it == stc.simple_tensors.end()) {
|
||||
return nullptr;
|
||||
}
|
||||
return it->second[index];
|
||||
}
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
|
||||
ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
|
||||
@@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
|
||||
src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
|
||||
continue;
|
||||
}
|
||||
src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
|
||||
src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
|
||||
GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
|
||||
}
|
||||
|
||||
@@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync);
|
||||
}
|
||||
|
||||
static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
GGML_UNUSED(buffer);
|
||||
return (void *) 0x1000000000000000; // FIXME
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
|
||||
static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
|
||||
const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
|
||||
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true);
|
||||
const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true);
|
||||
GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
|
||||
GGML_ASSERT(split_state.n_segments <= 16);
|
||||
|
||||
@@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
std::vector<ggml_tensor *> simple_tensors;
|
||||
simple_tensors.reserve(n_simple_bufs);
|
||||
for (size_t j = 0; j < n_simple_bufs; j++) {
|
||||
ggml_context * simple_ctx = buf_ctx->buf_configs[j].ctx;
|
||||
ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf;
|
||||
ggml_context * simple_ctx = stc.ctxs[j].get();
|
||||
ggml_backend_buffer_t simple_buf = buf_ctx->bufs[j].get();
|
||||
|
||||
if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
|
||||
// TODO: the following assert fails for llama-parallel even though the results are correct:
|
||||
@@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
|
||||
} else if (simple_buf != nullptr) {
|
||||
t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
|
||||
+ size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
|
||||
+ size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer));
|
||||
}
|
||||
t_ij->extra = tensor->extra;
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
@@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
|
||||
}
|
||||
}
|
||||
|
||||
buf_ctx->simple_tensors[tensor] = simple_tensors;
|
||||
stc.simple_tensors[tensor] = simple_tensors;
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next;
|
||||
return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor);
|
||||
}
|
||||
|
||||
static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
|
||||
GGML_ASSERT(ggml_is_contiguous(tensor));
|
||||
@@ -1275,6 +1325,9 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
const size_t simple_offset = i_start * chunk_size_j;
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1382,6 +1435,9 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
for (size_t j = 0; j < n_bufs; j++){
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
const size_t simple_offset = i_start * chunk_size_j;
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_j, simple_offset, chunk_size_j, i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1407,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
||||
}
|
||||
|
||||
static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||
const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer);
|
||||
for (size_t i = 0; i < n_buffers; i++) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
|
||||
for (size_t i = 0; i < buf_ctx->bufs.size(); i++) {
|
||||
ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
|
||||
}
|
||||
}
|
||||
@@ -1434,20 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
|
||||
static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ 1024*1024*1024, // FIXME
|
||||
const ggml_init_params params = {
|
||||
/*.mem_size =*/ 1024*1024*ggml_tensor_overhead(), // FIXME
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_backend_meta_simple_tensor_container stc_static;
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts);
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts);
|
||||
|
||||
ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
|
||||
size_t max_size = 0;
|
||||
buf_ctx->buf_configs.reserve(n_simple_bufts);
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
bufs.reserve(n_simple_bufts);
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
|
||||
max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
|
||||
buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
|
||||
bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size));
|
||||
GGML_ASSERT(bufs.back() != nullptr);
|
||||
max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back()));
|
||||
}
|
||||
ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
|
||||
}
|
||||
@@ -1455,28 +1516,53 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
|
||||
struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
|
||||
const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ 1024*1024*1024, // FIXME
|
||||
constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals.
|
||||
const ggml_init_params params_static = {
|
||||
/*.mem_size =*/ ggml_get_mem_size(ctx),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
const ggml_init_params params_compute = {
|
||||
/*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_backend_meta_simple_tensor_container stc_static (params_static, n_simple_bufts);
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts);
|
||||
ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts);
|
||||
|
||||
ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
|
||||
meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
|
||||
}
|
||||
std::vector<ggml_backend_buffer_t> bufs(n_simple_bufts, nullptr);
|
||||
ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
|
||||
|
||||
ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = meta_buf;
|
||||
ggml_backend_meta_buffer_init_tensor(meta_buf, t);
|
||||
ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t);
|
||||
t->data = (void *) 0x2000000000000000; // FIXME
|
||||
}
|
||||
for (size_t i = 0; i < n_simple_bufts; i++) {
|
||||
meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(
|
||||
meta_buf_ctx->buf_configs[i].ctx, ggml_backend_meta_buft_simple_buft(buft, i));
|
||||
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
|
||||
ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get();
|
||||
ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
|
||||
|
||||
// If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
|
||||
// For those edge cases, allocate a dummy buffer instead.
|
||||
bool any_nonzero_slice = false;
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (ggml_nelements(t) != 0) {
|
||||
any_nonzero_slice = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (any_nonzero_slice) {
|
||||
meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft));
|
||||
} else {
|
||||
meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0));
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
||||
t->buffer = meta_buf_ctx->bufs[i].get();
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(meta_buf_ctx->bufs[i]);
|
||||
meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get()));
|
||||
}
|
||||
return meta_buf;
|
||||
}
|
||||
@@ -1605,6 +1691,9 @@ static void ggml_backend_meta_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||
ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_tensor_set_2d_async(simple_backend, simple_tensor, (const char *) data + offset_j, offset, chunk_size_j,
|
||||
i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1646,6 +1735,9 @@ static void ggml_backend_meta_get_tensor_async(ggml_backend_t backend, const ggm
|
||||
ggml_backend_t simple_backend = ggml_backend_meta_simple_backend(backend, j);
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t chunk_size_j = simple_tensor->nb[split_state.axis + 1];
|
||||
if (chunk_size_j == 0) {
|
||||
continue;
|
||||
}
|
||||
ggml_backend_tensor_get_2d_async(simple_backend, simple_tensor, (char *) data + offset_j, offset, chunk_size_j,
|
||||
i_stop - i_start, chunk_size_j, chunk_size_full);
|
||||
offset_j += chunk_size_j;
|
||||
@@ -1692,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
}
|
||||
|
||||
if (needs_rebuild) {
|
||||
std::set<ggml_backend_buffer_t> used_buffers;
|
||||
for (int i = 0; i < cgraph->n_leafs; i++) {
|
||||
if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) {
|
||||
used_buffers.emplace(cgraph->leafs[i]->buffer);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) {
|
||||
used_buffers.emplace(cgraph->nodes[i]->buffer);
|
||||
}
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : used_buffers) {
|
||||
ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context;
|
||||
buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1;
|
||||
ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next];
|
||||
for (ggml_context_ptr & ctx : stc.ctxs) {
|
||||
ggml_reset(ctx.get());
|
||||
}
|
||||
stc.simple_tensors.clear();
|
||||
}
|
||||
size_t n_subgraphs = 0;
|
||||
size_t max_tmp_size = 0;
|
||||
|
||||
@@ -1877,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
|
||||
const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
|
||||
const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
|
||||
ggml_init_params params = {
|
||||
const ggml_init_params params = {
|
||||
/*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
|
||||
/*.mem_buffer =*/ nullptr,
|
||||
/*.no_alloc =*/ true,
|
||||
|
||||
@@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
|
||||
GGML_ASSERT(tensor);
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
|
||||
if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
|
||||
for (size_t i = 0; i < n_copies; i++) {
|
||||
ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
|
||||
}
|
||||
@@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
|
||||
}
|
||||
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
|
||||
}
|
||||
|
||||
|
||||
@@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
else()
|
||||
set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
if (GGML_OPENMP_ENABLED)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
|
||||
target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
|
||||
@@ -273,67 +273,51 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8; //get vector length
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
const int ggml_f16_epr = svcnth();
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr;
|
||||
const int np = n - (n % ggml_f16_step);
|
||||
const int np2 = n - (n % ggml_f16_epr);
|
||||
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t sum1 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum2 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum3 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum4 = svdup_n_f16(0.0f);
|
||||
svfloat32_t sum1_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum1_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum2_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum2_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum3_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum3_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum4_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum4_hi = svdup_n_f32(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0), GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0));
|
||||
ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1), GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1));
|
||||
ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2), GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2));
|
||||
ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3), GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3));
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4), GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4));
|
||||
ggml_sve_f16_fma_widened(&sum2_lo, &sum2_hi, GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5), GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5));
|
||||
ggml_sve_f16_fma_widened(&sum3_lo, &sum3_hi, GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6), GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6));
|
||||
ggml_sve_f16_fma_widened(&sum4_lo, &sum4_hi, GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7), GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7));
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
|
||||
for (int i = np; i < np2; i += ggml_f16_epr) {
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, GGML_F16x_VEC_LOAD(x + i, 0), GGML_F16x_VEC_LOAD(y + i, 0));
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
const svbool_t pg = svwhilelt_b16(np2, n);
|
||||
const svfloat16_t rx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
const svfloat16_t ry = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
sum1 = svmad_f16_x(pg, hx, hy, sum1);
|
||||
ggml_sve_f16_fma_widened(&sum1_lo, &sum1_hi, rx, ry);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
|
||||
|
||||
sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum2_lo);
|
||||
sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum2_hi);
|
||||
sum3_lo = svadd_f32_m(DEFAULT_PG32, sum3_lo, sum4_lo);
|
||||
sum3_hi = svadd_f32_m(DEFAULT_PG32, sum3_hi, sum4_hi);
|
||||
sum1_lo = svadd_f32_m(DEFAULT_PG32, sum1_lo, sum3_lo);
|
||||
sum1_hi = svadd_f32_m(DEFAULT_PG32, sum1_hi, sum3_hi);
|
||||
|
||||
sumf = ggml_sve_sum_f32x2(sum1_lo, sum1_hi);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
int vl = __riscv_vsetvlmax_e32m2();
|
||||
|
||||
@@ -14,6 +14,35 @@
|
||||
// floating point type used to accumulate sums
|
||||
typedef double ggml_float;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
inline static void ggml_sve_f16_fma_widened(
|
||||
svfloat32_t * acc_lo,
|
||||
svfloat32_t * acc_hi,
|
||||
svfloat16_t x,
|
||||
svfloat16_t y) {
|
||||
#if defined(__ARM_FEATURE_SVE2)
|
||||
*acc_lo = svmlalb_f32(*acc_lo, x, y);
|
||||
*acc_hi = svmlalt_f32(*acc_hi, x, y);
|
||||
#else
|
||||
// Plain SVE fallback path if SVE2 instructions not available
|
||||
svfloat16_t x_even = svtrn1_f16(x, x);
|
||||
svfloat16_t x_odd = svtrn2_f16(x, x);
|
||||
|
||||
svfloat16_t y_even = svtrn1_f16(y, y);
|
||||
svfloat16_t y_odd = svtrn2_f16(y, y);
|
||||
|
||||
svbool_t pg = svptrue_b32();
|
||||
|
||||
*acc_lo = svmla_f32_x(pg, *acc_lo, svcvt_f32_f16_x(pg, x_even), svcvt_f32_f16_x(pg, y_even));
|
||||
*acc_hi = svmla_f32_x(pg, *acc_hi, svcvt_f32_f16_x(pg, x_odd), svcvt_f32_f16_x(pg, y_odd));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static ggml_float ggml_sve_sum_f32x2(svfloat32_t sum_lo, svfloat32_t sum_hi) {
|
||||
return (ggml_float) (svaddv_f32(svptrue_b32(), sum_lo) + svaddv_f32(svptrue_b32(), sum_hi));
|
||||
}
|
||||
#endif
|
||||
|
||||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
|
||||
@@ -122,108 +151,61 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
const int ggml_f16_epr = svcnth();
|
||||
const int ggml_f16_step = 2 * ggml_f16_epr;
|
||||
int np = n - (n % ggml_f16_step);
|
||||
int np2 = n - (n % ggml_f16_epr);
|
||||
|
||||
int np = (n & ~(ggml_f16_step - 1));
|
||||
|
||||
svfloat16_t sum_00 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_01 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_02 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_03 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t sum_10 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_11 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_12 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_13 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
svfloat32_t sum_0_0_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_0_0_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_0_1_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_0_1_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_0_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_0_hi = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_1_lo = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum_1_1_hi = svdup_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
|
||||
const svfloat16_t ay0 = GGML_F16x_VEC_LOAD(y + i, 0);
|
||||
const svfloat16_t ax00 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
|
||||
const svfloat16_t ax01 = GGML_F16x_VEC_LOAD(x[1] + i, 0);
|
||||
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
|
||||
ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax00, ay0);
|
||||
ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax01, ay0);
|
||||
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
|
||||
const svfloat16_t ay1 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 0);
|
||||
const svfloat16_t ax10 = GGML_F16x_VEC_LOAD(x[0] + i + 1 * ggml_f16_epr, 0);
|
||||
const svfloat16_t ax11 = GGML_F16x_VEC_LOAD(x[1] + i + 1 * ggml_f16_epr, 0);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
|
||||
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
|
||||
ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
|
||||
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
|
||||
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
|
||||
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
|
||||
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
|
||||
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
|
||||
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
|
||||
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
|
||||
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
|
||||
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
|
||||
ggml_sve_f16_fma_widened(&sum_0_1_lo, &sum_0_1_hi, ax10, ay1);
|
||||
ggml_sve_f16_fma_widened(&sum_1_1_lo, &sum_1_1_hi, ax11, ay1);
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
for (int i = np; i < np2; i += ggml_f16_epr) {
|
||||
const svfloat16_t ry = GGML_F16x_VEC_LOAD(y + i, 0);
|
||||
const svfloat16_t rx0 = GGML_F16x_VEC_LOAD(x[0] + i, 0);
|
||||
const svfloat16_t rx1 = GGML_F16x_VEC_LOAD(x[1] + i, 0);
|
||||
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
|
||||
rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
|
||||
ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, rx0, ry);
|
||||
ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, rx1, ry);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
||||
svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
const svbool_t pg = svwhilelt_b16(np2, n);
|
||||
const svfloat16_t ay = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
const svfloat16_t ax0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
||||
const svfloat16_t ax1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
||||
|
||||
sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
|
||||
sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
|
||||
ggml_sve_f16_fma_widened(&sum_0_0_lo, &sum_0_0_hi, ax0, ay);
|
||||
ggml_sve_f16_fma_widened(&sum_1_0_lo, &sum_1_0_hi, ax1, ay);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
|
||||
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
|
||||
|
||||
svfloat32_t sum_0_lo = svadd_f32_x(DEFAULT_PG32, sum_0_0_lo, sum_0_1_lo);
|
||||
svfloat32_t sum_0_hi = svadd_f32_x(DEFAULT_PG32, sum_0_0_hi, sum_0_1_hi);
|
||||
svfloat32_t sum_1_lo = svadd_f32_x(DEFAULT_PG32, sum_1_0_lo, sum_1_1_lo);
|
||||
svfloat32_t sum_1_hi = svadd_f32_x(DEFAULT_PG32, sum_1_0_hi, sum_1_1_hi);
|
||||
sumf[0] = ggml_sve_sum_f32x2(sum_0_lo, sum_0_hi);
|
||||
sumf[1] = ggml_sve_sum_f32x2(sum_1_lo, sum_1_hi);
|
||||
np = n;
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
|
||||
@@ -110,11 +110,14 @@
|
||||
# define GGML_CUDA_USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8 and excludes HIP/MUSA.
|
||||
// PDL host-side support (cudaLaunchKernelEx) requires CUDART >= 11.8.
|
||||
// However, this has been bugged in CTK < 12.3 for MSVC builds, see
|
||||
// https://github.com/ggml-org/llama.cpp/pull/22522#discussion_r3302393293
|
||||
// __CUDA_ARCH__ is undefined in host passes; GPU arch check happens in device-side code.
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && \
|
||||
(CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
|
||||
# define GGML_CUDA_USE_PDL
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11080
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && (CUDART_VERSION >= 12030 || (!(defined(_MSC_VER) && !defined(__clang__)) && CUDART_VERSION >= 11080))
|
||||
|
||||
static __device__ __forceinline__ void ggml_cuda_pdl_sync() {
|
||||
#if defined(GGML_CUDA_USE_PDL) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
|
||||
@@ -1561,7 +1564,8 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
|
||||
return env == nullptr || std::atoi(env) != 0;
|
||||
}();
|
||||
|
||||
if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
|
||||
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
|
||||
|
||||
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
|
||||
|
||||
@@ -472,7 +472,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
|
||||
const int i = 8 * (threadIdx.x % (nbatch_fa/8));
|
||||
|
||||
cp_async_cg_16<preload>(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + j_vram*stride_mask + i);
|
||||
cp_async_cg_16<preload>(tile_mask_32 + j_sram*(nbatch_fa*sizeof(half) + 16) + i*sizeof(half), mask_h + int64_t(j_vram)*stride_mask + i);
|
||||
}
|
||||
} else if constexpr (oob_check) {
|
||||
#pragma unroll
|
||||
@@ -488,7 +488,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
for (int i0 = 0; i0 < nbatch_fa; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[j_vram*stride_mask + i] : half(0.0f);
|
||||
tile_mask[j_sram*(nbatch_fa + 8) + i] = i < i_sup ? mask_h[int64_t(j_vram)*stride_mask + i] : half(0.0f);
|
||||
}
|
||||
}
|
||||
} else if constexpr (nbatch_fa < 2*warp_size) {
|
||||
@@ -505,7 +505,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
|
||||
const int i = threadIdx.x % (warp_size/cols_per_warp);
|
||||
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + j_vram*stride_mask + 2*i);
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + 2*i, mask_h + int64_t(j_vram)*stride_mask + 2*i);
|
||||
}
|
||||
} else {
|
||||
#pragma unroll
|
||||
@@ -521,7 +521,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_load_mask(
|
||||
for (int i0 = 0; i0 < nbatch_fa; i0 += 2*warp_size) {
|
||||
const int i = i0 + 2*threadIdx.x;
|
||||
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + j_vram*stride_mask + i);
|
||||
ggml_cuda_memcpy_1<sizeof(half2)>(tile_mask + j_sram*(nbatch_fa + 8) + i, mask_h + int64_t(j_vram)*stride_mask + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
101
ggml/src/ggml-cuda/fwht.cu
Normal file
101
ggml/src/ggml-cuda/fwht.cu
Normal file
@@ -0,0 +1,101 @@
|
||||
#include "common.cuh"
|
||||
#include "fwht.cuh"
|
||||
|
||||
template <int N>
|
||||
__launch_bounds__(4*ggml_cuda_get_physical_warp_size(), 1)
|
||||
__global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows, const float scale) {
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
const int64_t r = (int64_t) blockIdx.x * blockDim.y + threadIdx.y;
|
||||
|
||||
if (r >= n_rows) {
|
||||
return;
|
||||
}
|
||||
|
||||
src += r * N;
|
||||
dst += r * N;
|
||||
|
||||
static constexpr int el_w = N / warp_size;
|
||||
float reg[el_w];
|
||||
const int lane = threadIdx.x;
|
||||
|
||||
ggml_cuda_pdl_sync();
|
||||
#pragma unroll
|
||||
for (int i = 0; i < el_w; ++i) {
|
||||
reg[i] = src[i * warp_size + lane] * scale;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int h = 1; h < warp_size; h *= 2) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < el_w; j++) {
|
||||
const float val = reg[j];
|
||||
const float val2 = __shfl_xor_sync(0xFFFFFFFF, val, h, warp_size);
|
||||
|
||||
reg[j] = (lane & h) == 0 ? val + val2 : val2 - val;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int h = warp_size; h < N; h *= 2) {
|
||||
const int step = h / warp_size;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < el_w; j += 2 * step) {
|
||||
#pragma unroll
|
||||
for (int k = 0; k < step; k++) {
|
||||
const float x = reg[j + k];
|
||||
const float y = reg[j + k + step];
|
||||
|
||||
reg[j + k] = x + y;
|
||||
reg[j + k + step] = x - y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < el_w; ++i) {
|
||||
dst[i * warp_size + lane] = reg[i];
|
||||
}
|
||||
}
|
||||
|
||||
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_are_same_shape(src, dst));
|
||||
if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
const int n = src->ne[0];
|
||||
const int64_t rows = ggml_nrows(src);
|
||||
|
||||
const float * src_d = (const float *) src->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
|
||||
const int rows_per_block = 4;
|
||||
|
||||
const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
dim3 grid_dims(num_blocks, 1, 1);
|
||||
dim3 block_dims(warp_size, rows_per_block, 1);
|
||||
const ggml_cuda_kernel_launch_params launch_params =
|
||||
ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
|
||||
|
||||
const float scale = 1 / sqrtf(n);
|
||||
|
||||
switch (n) {
|
||||
case 64:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 128:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 256:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
case 512:
|
||||
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
4
ggml/src/ggml-cuda/fwht.cuh
Normal file
4
ggml/src/ggml-cuda/fwht.cuh
Normal file
@@ -0,0 +1,4 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// Returns whether the Fast Walsh-Hadamard transform could be used.
|
||||
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user