Compare commits

..

13 Commits

Author SHA1 Message Date
Francis Couture-Harpin
a5b1943912 ggml-quants : fix some edge cases in make_qkxh_nl_quants 2025-03-23 17:59:37 -04:00
Francis Couture-Harpin
8b8b88f3de ggml-quants : restore Q2_K use of make_qp_quants
Weirdly, it seems like in practice replacing this instance is not better.
This is probably because of its interaction with make_qkx3_quants.
2025-03-22 18:47:56 -04:00
Francis Couture-Harpin
a41139723d Merge branch 'master' into compilade/optimal-rounding 2025-03-22 15:05:11 -04:00
Francis Couture-Harpin
af23abd3cb ggml-quants : remove slower qsort-based cumulative search 2025-03-22 12:08:42 -04:00
Francis Couture-Harpin
3e4b675c9f ggml-quants : use a max-heap for TQ1_0 and TQ2_0 quantization 2025-03-22 12:03:37 -04:00
Francis Couture-Harpin
f86b8ff210 ggml-quants : use qkxh in more places 2025-03-21 14:05:58 -04:00
Francis Couture-Harpin
3be115100f ggml-quants : use a max-heap for linear quants like Q3_K
Slightly faster than the previous method.
2025-03-20 19:21:45 -04:00
Francis Couture-Harpin
30ad9c2873 ggml-quants : faster exhaustive IQ4_NL rounding with k_heap 2025-03-15 12:57:44 -04:00
Francis Couture-Harpin
0c9e442489 ggml-quants : remove some commented code 2025-03-15 10:29:47 -04:00
Francis Couture-Harpin
f27c1afc40 ggml-quants : improve TQ2_0 imatrix 2025-03-07 12:54:56 -05:00
Francis Couture-Harpin
6f7fe74946 ggml-quants : improve imatrix behavior for TQ1_0, TQ2_0, Q4_0, Q5_0 2025-02-21 18:47:09 -05:00
Francis Couture-Harpin
d0060fc498 ggml-quants : better and faster make_qkxs_quants 2025-02-21 15:11:21 -05:00
Francis Couture-Harpin
dd6b8408c9 ggml-quants : improve IQ4_NL, IQ4_XS, and Q3_K 2025-02-21 13:49:18 -05:00
236 changed files with 23613 additions and 25505 deletions

View File

@@ -14,9 +14,9 @@ WORKDIR /app
COPY . .
RUN if [ "$TARGETARCH" = "amd64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
elif [ "$TARGETARCH" = "arm64" ]; then \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
else \
echo "Unsupported architecture"; \
exit 1; \

View File

@@ -21,7 +21,7 @@ COPY . .
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${OPT_SYCL_F16} && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -1,4 +1,4 @@
ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM ascendai/cann:$ASCEND_VERSION AS build
@@ -6,7 +6,7 @@ WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make libcurl-devel
RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

View File

@@ -35,7 +35,7 @@ COPY . .
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -17,8 +17,8 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
#ARG ROCM_DOCKER_ARCH=gfx1100
#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100
# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
@@ -40,7 +40,7 @@ WORKDIR /app
COPY . .
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \

View File

@@ -16,7 +16,7 @@ WORKDIR /app
COPY . .
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib && \

View File

@@ -1,25 +0,0 @@
name: 'Windows - Setup CURL'
description: 'Composite action, to be reused in other workflow'
inputs:
curl_version:
description: 'CURL version'
required: false
default: '8.6.0_6'
outputs:
curl_path:
description: "Path to the downloaded libcurl"
value: ${{ steps.get_libcurl.outputs.curl_path }}
runs:
using: "composite"
steps:
- name: libCURL
id: get_libcurl
shell: powershell
env:
CURL_VERSION: ${{ inputs.curl_version }}
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT

View File

@@ -104,6 +104,7 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DLLAMA_CUBLAS=ON \
-DCUDAToolkit_ROOT=/usr/local/cuda \
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \

View File

@@ -1,124 +0,0 @@
name: Build on Linux using cross-compiler
on:
workflow_dispatch:
workflow_call:
jobs:
ubuntu-latest-riscv64-cpu-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-latest-riscv64-vulkan-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Riscv
run: |
sudo dpkg --add-architecture riscv64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64 \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-latest-arm64-vulkan-cross:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Arm64
run: |
sudo dpkg --add-architecture arm64
sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
/etc/apt/sources.list /etc/apt/apt-mirrors.txt
sudo apt-get clean
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64 \
libcurl4-openssl-dev:arm64
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=aarch64 \
-DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
-DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)

View File

@@ -10,7 +10,7 @@ on:
push:
branches:
- master
paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
@@ -54,7 +54,6 @@ jobs:
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
@@ -63,6 +62,7 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON
@@ -92,6 +92,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
@@ -122,7 +123,6 @@ jobs:
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
@@ -133,6 +133,7 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
@@ -161,6 +162,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -205,6 +207,7 @@ jobs:
run: |
cmake -B build \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -243,6 +246,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
- name: Upload artifacts
@@ -277,7 +281,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Build
id: cmake_build
@@ -318,7 +322,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Build
id: cmake_build
@@ -356,7 +360,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libcurl4-openssl-dev
sudo apt-get install build-essential
- name: Build
id: cmake_build
@@ -393,7 +397,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk
- name: Build
id: cmake_build
@@ -427,6 +431,7 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -449,7 +454,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -525,7 +530,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
@@ -573,7 +578,7 @@ jobs:
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
@@ -601,9 +606,6 @@ jobs:
-DGGML_SYCL_F16=ON
cmake --build build --config Release -j $(nproc)
build-linux-cross:
uses: ./.github/workflows/build-linux-cross.yml
macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -631,7 +633,6 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -667,7 +668,6 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -697,7 +697,6 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_BUILD_COMMON=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -737,7 +736,6 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -805,7 +803,7 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.4.309.0
VULKAN_VERSION: 1.4.304.1
strategy:
matrix:
@@ -898,17 +896,10 @@ jobs:
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build build-arm64-release --target install --config release
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
@@ -968,10 +959,9 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -997,7 +987,7 @@ jobs:
DEBIAN_FRONTEND: noninteractive
run: |
apt update
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
apt install -y cmake build-essential ninja-build libgomp1 git
- name: ccache
uses: hendrikmuhs/ccache-action@v1.2.16
@@ -1099,23 +1089,16 @@ jobs:
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_CUDA=ON ^
-DGGML_RPC=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
-DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@@ -1136,10 +1119,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -1194,8 +1174,6 @@ jobs:
run: |
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
# TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
- name: Build
id: cmake_build
run: examples/sycl/win-build-sycl.bat
@@ -1281,14 +1259,8 @@ jobs:
key: ${{ github.job }}
evict-old-files: 1d
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@@ -1299,11 +1271,9 @@ jobs:
-DCMAKE_BUILD_TYPE=Release `
-DGGML_HIP=ON `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
# TODO: reuse windows-latest-cmake-hip instead of duplicating this job
windows-latest-cmake-hip-release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
runs-on: windows-latest
@@ -1345,14 +1315,8 @@ jobs:
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
@@ -1364,8 +1328,7 @@ jobs:
-DAMDGPU_TARGETS=${{ matrix.gpu_target }} `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DGGML_RPC=ON `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
-DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
@@ -1387,10 +1350,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll
7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
- name: Upload artifacts
@@ -1415,7 +1375,6 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TESTS=OFF \
-DLLAMA_BUILD_SERVER=OFF \
@@ -1771,7 +1730,7 @@ jobs:
strategy:
matrix:
cann:
- '8.1.RC1.alpha001-910b-openeuler22.03-py3.10'
- '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
device:
- 'ascend910b3'
build:
@@ -1784,7 +1743,7 @@ jobs:
- name: Dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake libcurl-devel
yum install -y git gcc gcc-c++ make cmake
- name: Build
run: |

View File

@@ -38,7 +38,7 @@ jobs:
# Multi-stage build
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete

View File

@@ -129,6 +129,7 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DGGML_OPENMP=OFF ;
@@ -141,6 +142,7 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -152,6 +154,7 @@ jobs:
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -192,14 +195,17 @@ jobs:
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
env:
CURL_VERSION: 8.6.0_6
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
- name: Build
id: cmake_build
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@@ -215,10 +221,8 @@ jobs:
- name: Copy Libcurl
id: prepare_libcurl
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
- name: Tests
id: server_integration_tests

View File

@@ -81,7 +81,7 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# Required for relocatable CMake package
@@ -168,11 +168,6 @@ add_subdirectory(src)
# utils, programs, examples and tests
#
if (NOT LLAMA_BUILD_COMMON)
message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
set(LLAMA_CURL OFF)
endif()
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
@@ -247,20 +242,3 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
#
# copy the license files
#
# Check if running in GitHub Actions
if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
endforeach()
endif()

View File

@@ -9,6 +9,13 @@
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
> [!IMPORTANT]
> New `llama.cpp` package location: [ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp)
>
> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp`
>
> More info: https://github.com/ggml-org/llama.cpp/discussions/11801
## Recent API changes
- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
@@ -105,8 +112,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
#### Multimodal
@@ -240,7 +245,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) | All |
## Building the project
@@ -524,35 +528,6 @@ If your issue is with model generation quality, then please at least scan the fo
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
## XCFramework
The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
and macOS. It can be used in Swift projects without the need to compile the
library from source. For example:
```swift
// swift-tools-version: 5.10
// The swift-tools-version declares the minimum version of Swift required to build this package.
import PackageDescription
let package = Package(
name: "MyLlamaPackage",
targets: [
.executableTarget(
name: "MyLlamaPackage",
dependencies: [
"LlamaFramework"
]),
.binaryTarget(
name: "LlamaFramework",
url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip",
checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab"
)
]
)
```
The above example is using an intermediate build `b5046` of the library. This can be modified
to use a different version by changing the URL and checksum.
## Completions
Command-line completion is available for some environments.

View File

@@ -399,7 +399,6 @@ cmake -B build-ios-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-ios-sim --config Release -- -quiet
@@ -412,7 +411,6 @@ cmake -B build-ios-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-ios-device --config Release -- -quiet
@@ -423,7 +421,6 @@ cmake -B build-macos -G Xcode \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-macos --config Release -- -quiet
@@ -437,7 +434,6 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -451,7 +447,6 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet
@@ -467,7 +462,6 @@ cmake -B build-tvos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-tvos-sim --config Release -- -quiet
@@ -482,7 +476,6 @@ cmake -B build-tvos-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-tvos-device --config Release -- -quiet

View File

@@ -26,43 +26,4 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with MUSA support
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
## Running MUSA CI in a Docker Container
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc3.1.1-devel-ubuntu22.04
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

View File

@@ -16,9 +16,6 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@@ -39,7 +36,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
@@ -55,24 +52,13 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
echo "source /opt/intel/oneapi/setvars.sh"
exit 1
fi
# Use only main GPU
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
# Enable sysman for correct memory reporting
export ZES_ENABLE_SYSMAN=1
# to circumvent precision issues on CPY operations
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
# Use qy1 by default (MTT S80)
MUSA_ARCH=${MUSA_ARCH:-21}
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi
## helpers
# download a file if it does not exist or if it is outdated
@@ -822,7 +808,7 @@ export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt
mnt_models=${MNT}/models
mkdir -p ${mnt_models}
@@ -840,10 +826,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
ret=0
if [ -z ${GG_BUILD_SYCL} ]; then
# SYCL build breaks with debug build flags
test $ret -eq 0 && gg_run ctest_debug
fi
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -851,9 +835,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
fi
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
fi
@@ -864,9 +846,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run ctest_with_model_debug
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi

View File

@@ -85,10 +85,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL)
if (NOT CURL_FOUND)
message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
endif()
find_package(CURL REQUIRED)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
@@ -117,8 +114,8 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.7.10:
GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
# v0.6.12:
GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE

View File

@@ -1,24 +1,12 @@
#include "gguf.h" // for reading GGUF splits
#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"
#include "chat.h"
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include <algorithm>
#include <climits>
#include <cstdarg>
#include <filesystem>
#include <fstream>
#include <regex>
#include <set>
@@ -26,14 +14,6 @@
#include <thread>
#include <vector>
//#define LLAMA_USE_CURL
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>
#endif
#include "json-schema-to-grammar.h"
using json = nlohmann::ordered_json;
@@ -145,555 +125,47 @@ std::string common_arg::to_string() {
return ss.str();
}
//
// downloader
//
struct common_hf_file_res {
std::string repo; // repo name with ":tag" removed
std::string ggufFile;
std::string mmprojFile;
};
#ifdef LLAMA_USE_CURL
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
# if !defined(PATH_MAX)
# define PATH_MAX MAX_PATH
# endif
#elif defined(_AIX)
#include <sys/limits.h>
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
// download one single file from remote URL to local path
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
// Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
}
bool force_download = false;
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
// Check if hf-token or bearer-token was specified
if (!bearer_token.empty()) {
std::string auth_header = "Authorization: Bearer " + bearer_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
}
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
nlohmann::json metadata;
std::string etag;
std::string last_modified;
if (file_exists) {
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
std::ifstream metadata_in(metadata_path);
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
}
}
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
};
common_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
}
}
return n_items;
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
force_download = true;
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
bool should_download = !file_exists || force_download;
if (!should_download) {
if (!etag.empty() && etag != headers.etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
}
}
if (should_download) {
std::string path_temporary = path + ".downloadInProgress";
if (file_exists) {
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false;
}
}
// Set the output file
struct FILE_deleter {
void operator()(FILE * f) const {
fclose(f);
}
};
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
return fwrite(data, size, nmemb, (FILE *)fd);
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
// display download progress
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
}
std::size_t at_pos = url.find('@', protocol_pos + 3);
if (at_pos == std::string::npos) {
return url; // No password in URL
}
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
};
// start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
long http_code = 0;
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return false;
}
// Causes file to be closed explicitly here before we rename it.
outfile.reset();
// Write the updated JSON metadata file.
metadata.update({
{"url", url},
{"etag", headers.etag},
{"lastModified", headers.last_modified}
});
std::ofstream(metadata_path) << metadata.dump(4);
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
}
return true;
}
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (auto const & item : urls) {
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
return common_download_file_single(it.first, it.second, bearer_token);
}, item));
}
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return false;
}
}
return true;
}
static bool common_download_model(
const common_params_model & model,
const std::string & bearer_token) {
// Basic validation of the model.url
if (model.url.empty()) {
LOG_ERR("%s: invalid model url\n", __func__);
return false;
}
if (!common_download_file_single(model.url, model.path, bearer_token)) {
return false;
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
return false;
}
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
gguf_free(ctx_gguf);
}
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
return false;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
return false;
}
}
std::vector<std::pair<std::string, std::string>> urls;
for (int idx = 1; idx < n_split; idx++) {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
if (std::string(split_path) == model.path) {
continue; // skip the already downloaded file
}
urls.push_back({split_url, split_path});
}
// Download in parallel
common_download_file_multiple(urls, bearer_token);
}
return true;
}
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!bearer_token.empty()) {
std::string auth_header = "Authorization: Bearer " + bearer_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
std::string ggufFile = "";
std::string mmprojFile = "";
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
// extract ggufFile.rfilename in json, using regex
{
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
std::smatch match;
if (std::regex_search(res_str, match, pattern)) {
ggufFile = match[1].str();
}
}
// extract mmprojFile.rfilename in json, using regex
{
std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
std::smatch match;
if (std::regex_search(res_str, match, pattern)) {
mmprojFile = match[1].str();
}
}
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (ggufFile.empty()) {
throw std::runtime_error("error: model does not have ggufFile");
}
return { hf_repo, ggufFile, mmprojFile };
}
#else
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return false;
}
static bool common_download_model(
const common_params_model &,
const std::string &) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return false;
}
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
return {};
}
#endif // LLAMA_USE_CURL
//
// utils
//
static void common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
}
model.hf_repo = auto_detected.repo;
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
} else {
model.hf_file = model.path;
static void common_params_handle_model_default(
std::string & model,
const std::string & model_url,
std::string & hf_repo,
std::string & hf_file,
const std::string & hf_token,
const std::string & model_default) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) {
if (model.empty()) {
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
if (auto_detected.first.empty() || auto_detected.second.empty()) {
exit(1); // built without CURL, error message already printed
}
hf_repo = auto_detected.first;
hf_file = auto_detected.second;
} else {
hf_file = model;
}
std::string hf_endpoint = "https://huggingface.co/";
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
if (hf_endpoint_env) {
hf_endpoint = hf_endpoint_env;
if (hf_endpoint.back() != '/') hf_endpoint += '/';
}
model.url = hf_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = model.hf_repo + "_" + model.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
model.path = fs_get_cache_file(filename);
}
} else if (!model.url.empty()) {
if (model.path.empty()) {
auto f = string_split<std::string>(model.url, '#').front();
f = string_split<std::string>(f, '?').front();
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.path.empty()) {
model.path = model_path_default;
}
}
// then, download it if needed
if (!model.url.empty()) {
bool ok = common_download_model(model, bearer_token);
if (!ok) {
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
exit(1);
// make sure model path is present (for caching purposes)
if (model.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = hf_repo + "_" + hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
model = fs_get_cache_file(filename);
}
} else if (!model_url.empty()) {
if (model.empty()) {
auto f = string_split<std::string>(model_url, '#').front();
f = string_split<std::string>(f, '?').front();
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.empty()) {
model = model_default;
}
}
@@ -828,16 +300,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model(params.speculative.model, params.hf_token, "");
common_params_handle_model(params.vocoder.model, params.hf_token, "");
// allow --mmproj to be set from -hf
// assuming that mmproj is always in the same repo as text model
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
params.mmproj.hf_repo = params.model.hf_repo;
}
common_params_handle_model(params.mmproj, params.hf_token, "", true);
// TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
if (params.escape) {
string_process_escapes(params.prompt);
@@ -856,10 +322,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
params.kv_overrides.back().key[0] = 0;
}
if (!params.tensor_buft_overrides.empty()) {
params.tensor_buft_overrides.push_back({nullptr, nullptr});
}
if (params.reranking && params.embedding) {
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
@@ -2099,14 +1561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--mmproj"}, "FILE",
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.path = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
params.mmproj = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
add_opt(common_arg(
@@ -2192,41 +1647,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
exit(0);
}
));
add_opt(common_arg(
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
"override tensor buffer type", [](common_params & params, const std::string & value) {
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
if (buft_list.empty()) {
// enumerate all the devices and add their buffer types to the list
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
auto * buft = ggml_backend_dev_buffer_type(dev);
if (buft) {
buft_list[ggml_backend_buft_name(buft)] = buft;
}
}
}
for (const auto & override : string_split<std::string>(value, ',')) {
std::string::size_type pos = override.find('=');
if (pos == std::string::npos) {
throw std::invalid_argument("invalid value");
}
std::string tensor_name = override.substr(0, pos);
std::string buffer_type = override.substr(pos + 1);
if (buft_list.find(buffer_type) == buft_list.end()) {
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
}
throw std::invalid_argument("unknown buffer type");
}
// FIXME: this leaks memory
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
}
}
));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
@@ -2370,14 +1790,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
),
[](common_params & params, const std::string & value) {
params.model.path = value;
params.model = value;
}
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
add_opt(common_arg(
{"-mu", "--model-url"}, "MODEL_URL",
"model download url (default: unused)",
[](common_params & params, const std::string & value) {
params.model.url = value;
params.model_url = value;
}
).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg(
@@ -2386,35 +1806,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
params.model.hf_repo = value;
params.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.hf_repo = value;
params.speculative.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
[](common_params & params, const std::string & value) {
params.model.hf_file = value;
params.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
"Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.model.hf_repo = value;
params.vocoder.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO_V"));
add_opt(common_arg(
{"-hffv", "--hf-file-v"}, "FILE",
"Hugging Face model file for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.model.hf_file = value;
params.vocoder.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE_V"));
add_opt(common_arg(
@@ -2559,7 +1979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--host"}, "HOST",
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
string_format("ip address to listen (default: %s)", params.hostname.c_str()),
[](common_params & params, const std::string & value) {
params.hostname = value;
}
@@ -3034,7 +2454,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.path = value;
params.speculative.model = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
@@ -3042,7 +2462,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"-mv", "--model-vocoder"}, "FNAME",
"vocoder model for audio generation (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.model.path = value;
params.vocoder.model = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
@@ -3065,10 +2485,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--tts-oute-default"},
string_format("use default OuteTTS models (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
params.vocoder.hf_repo = "ggml-org/WavTokenizer";
params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
}
).set_examples({LLAMA_EXAMPLE_TTS}));
@@ -3076,8 +2496,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--embd-bge-small-en-default"},
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
@@ -3090,8 +2510,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--embd-e5-small-en-default"},
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
params.model.hf_file = "e5-small-v2-q8_0.gguf";
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
params.hf_file = "e5-small-v2-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
@@ -3104,8 +2524,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--embd-gte-small-default"},
string_format("use default gte-small model (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
params.model.hf_file = "gte-small-q8_0.gguf";
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
params.hf_file = "gte-small-q8_0.gguf";
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
params.embd_normalize = 2;
params.n_ctx = 512;
@@ -3118,8 +2538,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--fim-qwen-1.5b-default"},
string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
@@ -3134,8 +2554,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--fim-qwen-3b-default"},
string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
@@ -3150,8 +2570,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--fim-qwen-7b-default"},
string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.port = 8012;
params.n_gpu_layers = 99;
params.flash_attn = true;
@@ -3166,10 +2586,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--fim-qwen-7b-spec"},
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;
@@ -3185,10 +2605,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--fim-qwen-14b-spec"},
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
[](common_params & params) {
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
params.speculative.n_gpu_layers = 99;
params.port = 8012;
params.n_gpu_layers = 99;

View File

@@ -7,6 +7,9 @@
#include "common.h"
#include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "llama.h"
#include <algorithm>
@@ -48,11 +51,47 @@
#include <sys/stat.h>
#include <unistd.h>
#endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if defined(LLAMA_USE_CURL)
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
# if !defined(PATH_MAX)
# define PATH_MAX MAX_PATH
# endif
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
//
// CPU utils
//
@@ -861,14 +900,22 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
struct common_init_result common_init_from_params(common_params & params) {
common_init_result iparams;
auto mparams = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_model_load_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
}
@@ -903,7 +950,7 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_model_free(model);
return iparams;
}
@@ -1042,18 +1089,15 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (!params.devices.empty()) {
mparams.devices = params.devices.data();
}
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
} else {
@@ -1061,13 +1105,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.kv_overrides = params.kv_overrides.data();
}
if (params.tensor_buft_overrides.empty()) {
mparams.tensor_buft_overrides = NULL;
} else {
GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
}
return mparams;
}
@@ -1127,6 +1164,451 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
return tpp;
}
#ifdef LLAMA_USE_CURL
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
return true;
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
return false;
}
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
}
bool force_download = false;
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
}
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
nlohmann::json metadata;
std::string etag;
std::string last_modified;
if (file_exists) {
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
std::ifstream metadata_in(metadata_path);
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
}
}
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
struct common_load_model_from_url_headers {
std::string etag;
std::string last_modified;
};
common_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
}
}
return n_items;
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
long http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
force_download = true;
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
bool should_download = !file_exists || force_download;
if (!should_download) {
if (!etag.empty() && etag != headers.etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
}
}
if (should_download) {
std::string path_temporary = path + ".downloadInProgress";
if (file_exists) {
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false;
}
}
// Set the output file
struct FILE_deleter {
void operator()(FILE * f) const {
fclose(f);
}
};
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
return fwrite(data, size, nmemb, (FILE *)fd);
};
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
// display download progress
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
}
std::size_t at_pos = url.find('@', protocol_pos + 3);
if (at_pos == std::string::npos) {
return url; // No password in URL
}
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
};
// start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
return false;
}
long http_code = 0;
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return false;
}
// Causes file to be closed explicitly here before we rename it.
outfile.reset();
// Write the updated JSON metadata file.
metadata.update({
{"url", url},
{"etag", headers.etag},
{"lastModified", headers.last_modified}
});
std::ofstream(metadata_path) << metadata.dump(4);
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
}
return true;
}
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}
if (!common_download_file(model_url, local_path, hf_token)) {
return NULL;
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL;
}
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
gguf_free(ctx_gguf);
}
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL;
}
}
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
return common_download_file(split_url, split_path, hf_token);
}, idx));
}
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return NULL;
}
}
}
return llama_model_load_from_file(local_path.c_str(), params);
}
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
//
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
//
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += remote_path;
return common_load_model_from_url(model_url, local_path, hf_token, params);
}
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
#else
struct llama_model * common_load_model_from_url(
const std::string & /*model_url*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
struct llama_model * common_load_model_from_hf(
const std::string & /*repo*/,
const std::string & /*remote_path*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return std::make_pair("", "");
}
#endif // LLAMA_USE_CURL
//
// Batch utils
//
@@ -1550,3 +2032,26 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return result;
}
template <>
json common_grammar_trigger::to_json() const {
json out {
{"type", (int) type},
{"value", value},
};
if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
out["token"] = (int) token;
}
return out;
}
template <>
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
common_grammar_trigger out;
out.type = (common_grammar_trigger_type) in.at("type").get<int>();
out.value = in.at("value").get<std::string>();
if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
out.token = (llama_token) in.at("token").get<int>();
}
return out;
}

View File

@@ -121,6 +121,10 @@ struct common_grammar_trigger {
common_grammar_trigger_type type;
std::string value;
llama_token token = LLAMA_TOKEN_NULL;
// T can only be nlohmann::ordered_json
template <class T> T to_json() const;
template <class T> static common_grammar_trigger from_json(const T & in);
};
// sampling parameters
@@ -180,13 +184,6 @@ struct common_params_sampling {
std::string print() const;
};
struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
};
struct common_params_speculative {
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
@@ -200,11 +197,19 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct common_params_model model;
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
};
struct common_params_vocoder {
struct common_params_model model;
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string speaker_file = ""; // speaker file path // NOLINT
@@ -262,10 +267,12 @@ struct common_params {
struct common_params_speculative speculative;
struct common_params_vocoder vocoder;
struct common_params_model model;
std::string model = ""; // model path // NOLINT
std::string model_alias = ""; // model alias // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; // NOLINT
std::string system_prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -279,7 +286,6 @@ struct common_params {
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -341,7 +347,7 @@ struct common_params {
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava)
struct common_params_model mmproj;
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
// embedding
@@ -540,6 +546,23 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

View File

@@ -11,24 +11,25 @@ struct llama_sampler_llg {
std::string grammar_kind;
std::string grammar_data;
LlgTokenizer * tokenizer;
LlgMatcher * grammar;
LlgConstraint * grammar;
LlgMaskResult llg_res;
bool has_llg_res;
};
static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
const char * grammar_data) {
LlgConstraintInit cinit;
llg_constraint_init_set_defaults(&cinit, tokenizer);
const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
if (log_level && *log_level) {
cinit.log_stderr_level = atoi(log_level);
}
auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
if (llg_matcher_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
llg_free_matcher(c);
auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
if (llg_get_error(c)) {
LOG_ERR("llg error: %s\n", llg_get_error(c));
llg_free_constraint(c);
return nullptr;
}
return c;
}
@@ -39,29 +40,39 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_matcher_consume_token(ctx->grammar, token);
LlgCommitResult res;
llg_commit_token(ctx->grammar, token, &res);
ctx->has_llg_res = false;
}
}
static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
if (mask == nullptr) {
if (llg_matcher_compute_mask(ctx->grammar) == 0) {
mask = llg_matcher_get_mask(ctx->grammar);
if (!ctx->has_llg_res) {
if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
ctx->has_llg_res = true;
} else {
LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
llg_free_matcher(ctx->grammar);
LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
llg_free_constraint(ctx->grammar);
ctx->grammar = nullptr;
return;
}
}
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
if (ctx->has_llg_res) {
if (ctx->llg_res.is_stop) {
for (size_t i = 0; i < cur_p->size; ++i) {
if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
cur_p->data[i].logit = -INFINITY;
}
}
} else {
const uint32_t * mask = ctx->llg_res.sample_mask;
for (size_t i = 0; i < cur_p->size; ++i) {
auto token = cur_p->data[i].id;
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
cur_p->data[i].logit = -INFINITY;
}
}
}
}
}
@@ -69,9 +80,14 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array
static void llama_sampler_llg_reset(llama_sampler * smpl) {
auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_matcher_reset(ctx->grammar);
if (!ctx->grammar) {
return;
}
auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
llg_free_constraint(ctx->grammar);
ctx->grammar = grammar_new;
ctx->has_llg_res = false;
}
static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -86,7 +102,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
if (ctx->grammar) {
result_ctx->grammar_kind = ctx->grammar_kind;
result_ctx->grammar_data = ctx->grammar_data;
result_ctx->grammar = llg_clone_matcher(ctx->grammar);
result_ctx->grammar = llg_clone_constraint(ctx->grammar);
result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
}
}
@@ -98,7 +114,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
const auto * ctx = (llama_sampler_llg *) smpl->ctx;
if (ctx->grammar) {
llg_free_matcher(ctx->grammar);
llg_free_constraint(ctx->grammar);
llg_free_tokenizer(ctx->tokenizer);
}
@@ -223,11 +239,9 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
/* .grammar_data = */ grammar_data,
/* .tokenizer = */ tokenizer,
/* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
if (ctx->grammar) {
GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
llg_matcher_get_mask_byte_size(ctx->grammar));
}
} else {
*ctx = {
/* .vocab = */ vocab,
@@ -235,12 +249,15 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
/* .grammar_data = */ {},
/* .tokenizer = */ nullptr,
/* .grammar = */ nullptr,
/* .llg_res = */ {},
/* .has_llg_res = */ false,
};
}
return llama_sampler_init(
/* .iface = */ &llama_sampler_llg_i,
/* .ctx = */ ctx);
/* .ctx = */ ctx
);
}
#else

View File

@@ -9,19 +9,10 @@
#pragma once
#include "minja.hpp"
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <exception>
#include <iomanip>
#include <memory>
#include <sstream>
#include <json.hpp>
#include <string>
#include <vector>
#include <json.hpp>
using json = nlohmann::ordered_json;
namespace minja {
@@ -434,7 +425,7 @@ class chat_template {
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && !content.empty()) {
if (!content.is_null() && content != "") {
obj["content"] = content;
}
message["content"] = obj.dump(2);
@@ -444,12 +435,13 @@ class chat_template {
if (polyfill_tool_responses && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", json::object()},
{"tool_response", {
{"content", message.at("content")},
}},
};
if (message.contains("name")) {
obj["tool_response"]["tool"] = message.at("name");
obj["tool_response"]["name"] = message.at("name");
}
obj["tool_response"]["content"] = message.at("content");
if (message.contains("tool_call_id")) {
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
}
@@ -518,7 +510,7 @@ class chat_template {
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
json messages_with_system = messages;
if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
std::string existing_system = messages_with_system.at(0).at("content");
messages_with_system[0] = json {
{"role", "system"},

View File

@@ -8,26 +8,14 @@
// SPDX-License-Identifier: MIT
#pragma once
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cmath>
#include <exception>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <stdexcept>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <regex>
#include <memory>
#include <stdexcept>
#include <sstream>
#include <unordered_set>
#include <json.hpp>
using json = nlohmann::ordered_json;
@@ -743,51 +731,51 @@ public:
struct TextTemplateToken : public TemplateToken {
std::string text;
TextTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, loc, pre, post), text(t) {}
TextTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, location, pre, post), text(t) {}
};
struct ExpressionTemplateToken : public TemplateToken {
std::shared_ptr<Expression> expr;
ExpressionTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, loc, pre, post), expr(std::move(e)) {}
ExpressionTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, location, pre, post), expr(std::move(e)) {}
};
struct IfTemplateToken : public TemplateToken {
std::shared_ptr<Expression> condition;
IfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, loc, pre, post), condition(std::move(c)) {}
IfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, location, pre, post), condition(std::move(c)) {}
};
struct ElifTemplateToken : public TemplateToken {
std::shared_ptr<Expression> condition;
ElifTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, loc, pre, post), condition(std::move(c)) {}
ElifTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, location, pre, post), condition(std::move(c)) {}
};
struct ElseTemplateToken : public TemplateToken {
ElseTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, loc, pre, post) {}
ElseTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, location, pre, post) {}
};
struct EndIfTemplateToken : public TemplateToken {
EndIfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, loc, pre, post) {}
EndIfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, location, pre, post) {}
};
struct MacroTemplateToken : public TemplateToken {
std::shared_ptr<VariableExpr> name;
Expression::Parameters params;
MacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
: TemplateToken(Type::Macro, loc, pre, post), name(std::move(n)), params(std::move(p)) {}
MacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
: TemplateToken(Type::Macro, location, pre, post), name(std::move(n)), params(std::move(p)) {}
};
struct EndMacroTemplateToken : public TemplateToken {
EndMacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, loc, pre, post) {}
EndMacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, location, pre, post) {}
};
struct FilterTemplateToken : public TemplateToken {
std::shared_ptr<Expression> filter;
FilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
: TemplateToken(Type::Filter, loc, pre, post), filter(std::move(filter)) {}
FilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
: TemplateToken(Type::Filter, location, pre, post), filter(std::move(filter)) {}
};
struct EndFilterTemplateToken : public TemplateToken {
EndFilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, loc, pre, post) {}
EndFilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, location, pre, post) {}
};
struct ForTemplateToken : public TemplateToken {
@@ -795,38 +783,38 @@ struct ForTemplateToken : public TemplateToken {
std::shared_ptr<Expression> iterable;
std::shared_ptr<Expression> condition;
bool recursive;
ForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
ForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
std::shared_ptr<Expression> && c, bool r)
: TemplateToken(Type::For, loc, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
: TemplateToken(Type::For, location, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
};
struct EndForTemplateToken : public TemplateToken {
EndForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, loc, pre, post) {}
EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {}
};
struct GenerationTemplateToken : public TemplateToken {
GenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, loc, pre, post) {}
GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {}
};
struct EndGenerationTemplateToken : public TemplateToken {
EndGenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, loc, pre, post) {}
EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {}
};
struct SetTemplateToken : public TemplateToken {
std::string ns;
std::vector<std::string> var_names;
std::shared_ptr<Expression> value;
SetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateToken(Type::Set, loc, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
SetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateToken(Type::Set, location, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
};
struct EndSetTemplateToken : public TemplateToken {
EndSetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, loc, pre, post) {}
EndSetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, location, pre, post) {}
};
struct CommentTemplateToken : public TemplateToken {
std::string text;
CommentTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, loc, pre, post), text(t) {}
CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {}
};
enum class LoopControlType { Break, Continue };
@@ -842,7 +830,7 @@ public:
struct LoopControlTemplateToken : public TemplateToken {
LoopControlType control_type;
LoopControlTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, loc, pre, post), control_type(control_type) {}
LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {}
};
class TemplateNode {
@@ -880,8 +868,8 @@ public:
class SequenceNode : public TemplateNode {
std::vector<std::shared_ptr<TemplateNode>> children;
public:
SequenceNode(const Location & loc, std::vector<std::shared_ptr<TemplateNode>> && c)
: TemplateNode(loc), children(std::move(c)) {}
SequenceNode(const Location & location, std::vector<std::shared_ptr<TemplateNode>> && c)
: TemplateNode(location), children(std::move(c)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
for (const auto& child : children) child->render(out, context);
}
@@ -890,7 +878,7 @@ public:
class TextNode : public TemplateNode {
std::string text;
public:
TextNode(const Location & loc, const std::string& t) : TemplateNode(loc), text(t) {}
TextNode(const Location & location, const std::string& t) : TemplateNode(location), text(t) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> &) const override {
out << text;
}
@@ -899,7 +887,7 @@ public:
class ExpressionNode : public TemplateNode {
std::shared_ptr<Expression> expr;
public:
ExpressionNode(const Location & loc, std::shared_ptr<Expression> && e) : TemplateNode(loc), expr(std::move(e)) {}
ExpressionNode(const Location & location, std::shared_ptr<Expression> && e) : TemplateNode(location), expr(std::move(e)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
if (!expr) throw std::runtime_error("ExpressionNode.expr is null");
auto result = expr->evaluate(context);
@@ -916,8 +904,8 @@ public:
class IfNode : public TemplateNode {
std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> cascade;
public:
IfNode(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
: TemplateNode(loc), cascade(std::move(c)) {}
IfNode(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
: TemplateNode(location), cascade(std::move(c)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
for (const auto& branch : cascade) {
auto enter_branch = true;
@@ -936,7 +924,7 @@ public:
class LoopControlNode : public TemplateNode {
LoopControlType control_type_;
public:
LoopControlNode(const Location & loc, LoopControlType control_type) : TemplateNode(loc), control_type_(control_type) {}
LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {}
void do_render(std::ostringstream &, const std::shared_ptr<Context> &) const override {
throw LoopControlException(control_type_);
}
@@ -950,9 +938,9 @@ class ForNode : public TemplateNode {
bool recursive;
std::shared_ptr<TemplateNode> else_body;
public:
ForNode(const Location & loc, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
ForNode(const Location & location, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
std::shared_ptr<Expression> && condition, std::shared_ptr<TemplateNode> && body, bool recursive, std::shared_ptr<TemplateNode> && else_body)
: TemplateNode(loc), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
: TemplateNode(location), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
// https://jinja.palletsprojects.com/en/3.0.x/templates/#for
@@ -1037,8 +1025,8 @@ class MacroNode : public TemplateNode {
std::shared_ptr<TemplateNode> body;
std::unordered_map<std::string, size_t> named_param_positions;
public:
MacroNode(const Location & loc, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
: TemplateNode(loc), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
MacroNode(const Location & location, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
: TemplateNode(location), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
for (size_t i = 0; i < params.size(); ++i) {
const auto & name = params[i].first;
if (!name.empty()) {
@@ -1084,8 +1072,8 @@ class FilterNode : public TemplateNode {
std::shared_ptr<TemplateNode> body;
public:
FilterNode(const Location & loc, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
: TemplateNode(loc), filter(std::move(f)), body(std::move(b)) {}
FilterNode(const Location & location, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
: TemplateNode(location), filter(std::move(f)), body(std::move(b)) {}
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
if (!filter) throw std::runtime_error("FilterNode.filter is null");
@@ -1107,8 +1095,8 @@ class SetNode : public TemplateNode {
std::vector<std::string> var_names;
std::shared_ptr<Expression> value;
public:
SetNode(const Location & loc, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateNode(loc), ns(ns), var_names(vns), value(std::move(v)) {}
SetNode(const Location & location, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
: TemplateNode(location), ns(ns), var_names(vns), value(std::move(v)) {}
void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
if (!value) throw std::runtime_error("SetNode.value is null");
if (!ns.empty()) {
@@ -1130,8 +1118,8 @@ class SetTemplateNode : public TemplateNode {
std::string name;
std::shared_ptr<TemplateNode> template_value;
public:
SetTemplateNode(const Location & loc, const std::string & name, std::shared_ptr<TemplateNode> && tv)
: TemplateNode(loc), name(name), template_value(std::move(tv)) {}
SetTemplateNode(const Location & location, const std::string & name, std::shared_ptr<TemplateNode> && tv)
: TemplateNode(location), name(name), template_value(std::move(tv)) {}
void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
if (!template_value) throw std::runtime_error("SetTemplateNode.template_value is null");
Value value { template_value->render(context) };
@@ -1144,8 +1132,8 @@ class IfExpr : public Expression {
std::shared_ptr<Expression> then_expr;
std::shared_ptr<Expression> else_expr;
public:
IfExpr(const Location & loc, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
: Expression(loc), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
IfExpr(const Location & location, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
: Expression(location), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!condition) throw std::runtime_error("IfExpr.condition is null");
if (!then_expr) throw std::runtime_error("IfExpr.then_expr is null");
@@ -1162,16 +1150,16 @@ public:
class LiteralExpr : public Expression {
Value value;
public:
LiteralExpr(const Location & loc, const Value& v)
: Expression(loc), value(v) {}
LiteralExpr(const Location & location, const Value& v)
: Expression(location), value(v) {}
Value do_evaluate(const std::shared_ptr<Context> &) const override { return value; }
};
class ArrayExpr : public Expression {
std::vector<std::shared_ptr<Expression>> elements;
public:
ArrayExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && e)
: Expression(loc), elements(std::move(e)) {}
ArrayExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && e)
: Expression(location), elements(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
auto result = Value::array();
for (const auto& e : elements) {
@@ -1185,8 +1173,8 @@ public:
class DictExpr : public Expression {
std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> elements;
public:
DictExpr(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
: Expression(loc), elements(std::move(e)) {}
DictExpr(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
: Expression(location), elements(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
auto result = Value::object();
for (const auto& [key, value] : elements) {
@@ -1201,8 +1189,8 @@ public:
class SliceExpr : public Expression {
public:
std::shared_ptr<Expression> start, end;
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
SliceExpr(const Location & location, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
: Expression(location), start(std::move(s)), end(std::move(e)) {}
Value do_evaluate(const std::shared_ptr<Context> &) const override {
throw std::runtime_error("SliceExpr not implemented");
}
@@ -1212,8 +1200,8 @@ class SubscriptExpr : public Expression {
std::shared_ptr<Expression> base;
std::shared_ptr<Expression> index;
public:
SubscriptExpr(const Location & loc, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
: Expression(loc), base(std::move(b)), index(std::move(i)) {}
SubscriptExpr(const Location & location, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
: Expression(location), base(std::move(b)), index(std::move(i)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!base) throw std::runtime_error("SubscriptExpr.base is null");
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
@@ -1255,8 +1243,8 @@ public:
enum class Op { Plus, Minus, LogicalNot, Expansion, ExpansionDict };
std::shared_ptr<Expression> expr;
Op op;
UnaryOpExpr(const Location & loc, std::shared_ptr<Expression> && e, Op o)
: Expression(loc), expr(std::move(e)), op(o) {}
UnaryOpExpr(const Location & location, std::shared_ptr<Expression> && e, Op o)
: Expression(location), expr(std::move(e)), op(o) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!expr) throw std::runtime_error("UnaryOpExpr.expr is null");
auto e = expr->evaluate(context);
@@ -1281,8 +1269,8 @@ private:
std::shared_ptr<Expression> right;
Op op;
public:
BinaryOpExpr(const Location & loc, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
: Expression(loc), left(std::move(l)), right(std::move(r)), op(o) {}
BinaryOpExpr(const Location & location, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
: Expression(location), left(std::move(l)), right(std::move(r)), op(o) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!left) throw std::runtime_error("BinaryOpExpr.left is null");
if (!right) throw std::runtime_error("BinaryOpExpr.right is null");
@@ -1439,8 +1427,8 @@ class MethodCallExpr : public Expression {
std::shared_ptr<VariableExpr> method;
ArgumentsExpression args;
public:
MethodCallExpr(const Location & loc, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
: Expression(loc), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
MethodCallExpr(const Location & location, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
: Expression(location), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!object) throw std::runtime_error("MethodCallExpr.object is null");
if (!method) throw std::runtime_error("MethodCallExpr.method is null");
@@ -1538,8 +1526,8 @@ class CallExpr : public Expression {
public:
std::shared_ptr<Expression> object;
ArgumentsExpression args;
CallExpr(const Location & loc, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
: Expression(loc), object(std::move(obj)), args(std::move(a)) {}
CallExpr(const Location & location, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
: Expression(location), object(std::move(obj)), args(std::move(a)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!object) throw std::runtime_error("CallExpr.object is null");
auto obj = object->evaluate(context);
@@ -1554,8 +1542,8 @@ public:
class FilterExpr : public Expression {
std::vector<std::shared_ptr<Expression>> parts;
public:
FilterExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && p)
: Expression(loc), parts(std::move(p)) {}
FilterExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && p)
: Expression(location), parts(std::move(p)) {}
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
Value result;
bool first = true;
@@ -2472,7 +2460,7 @@ private:
static std::regex leading_space_regex(R"(^\s+)");
text = std::regex_replace(text, leading_space_regex, "");
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
if (!text.empty() && text[0] == '\n') {
if (text.length() > 0 && text[0] == '\n') {
text.erase(0, 1);
}
}
@@ -2550,7 +2538,7 @@ public:
TemplateTokenIterator begin = tokens.begin();
auto it = begin;
TemplateTokenIterator end = tokens.end();
return parser.parseTemplate(begin, it, end, /* fully= */ true);
return parser.parseTemplate(begin, it, end, /* full= */ true);
}
};
@@ -2589,7 +2577,7 @@ inline std::shared_ptr<Context> Context::builtins() {
throw std::runtime_error(args.at("message").get<std::string>());
}));
globals.set("tojson", simple_function("tojson", { "value", "indent" }, [](const std::shared_ptr<Context> &, Value & args) {
return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* to_json= */ true));
return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* tojson= */ true));
}));
globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr<Context> &, Value & args) {
auto items = Value::array();
@@ -2611,25 +2599,21 @@ inline std::shared_ptr<Context> Context::builtins() {
globals.set("last", simple_function("last", { "items" }, [](const std::shared_ptr<Context> &, Value & args) {
auto items = args.at("items");
if (!items.is_array()) throw std::runtime_error("object is not a list");
if (items.empty()) return Value();
if (items.size() == 0) return Value();
return items.at(items.size() - 1);
}));
globals.set("trim", simple_function("trim", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
auto & text = args.at("text");
return text.is_null() ? text : Value(strip(text.get<std::string>()));
}));
auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
auto text = args.at("text");
if (text.is_null()) return text;
std::string res;
auto str = text.get<std::string>();
std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
return Value(res);
});
};
globals.set("lower", char_transform_function("lower", ::tolower));
globals.set("upper", char_transform_function("upper", ::toupper));
globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
auto text = args.at("text");
if (text.is_null()) return text;
std::string res;
auto str = text.get<std::string>();
std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
return Value(res);
}));
globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
args.expectArgs("default", {2, 3}, {0, 1});
auto & value = args.args[0];
@@ -2759,17 +2743,12 @@ inline std::shared_ptr<Context> Context::builtins() {
return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
auto & items = args.args[0];
if (items.is_null()) {
if (items.is_null())
return Value::array();
}
if (!items.is_array()) {
throw std::runtime_error("object is not iterable: " + items.dump());
}
if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
auto filter_fn = context->get(args.args[1]);
if (filter_fn.is_null()) {
throw std::runtime_error("Undefined filter: " + args.args[1].dump());
}
if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
auto filter_args = Value::array();
for (size_t i = 2, n = args.args.size(); i < n; i++) {
@@ -2891,25 +2870,20 @@ inline std::shared_ptr<Context> Context::builtins() {
auto v = arg.get<int64_t>();
startEndStep[i] = v;
param_set[i] = true;
}
}
}
for (auto & [name, value] : args.kwargs) {
size_t i;
if (name == "start") {
i = 0;
} else if (name == "end") {
i = 1;
} else if (name == "step") {
i = 2;
} else {
throw std::runtime_error("Unknown argument " + name + " for function range");
}
for (auto & [name, value] : args.kwargs) {
size_t i;
if (name == "start") i = 0;
else if (name == "end") i = 1;
else if (name == "step") i = 2;
else throw std::runtime_error("Unknown argument " + name + " for function range");
if (param_set[i]) {
throw std::runtime_error("Duplicate argument " + name + " for function range");
}
startEndStep[i] = value.get<int64_t>();
param_set[i] = true;
if (param_set[i]) {
throw std::runtime_error("Duplicate argument " + name + " for function range");
}
startEndStep[i] = value.get<int64_t>();
param_set[i] = true;
}
if (!param_set[1]) {
throw std::runtime_error("Missing required argument 'end' for function range");

View File

@@ -208,9 +208,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
trigger_patterns_c.data(), trigger_patterns_c.size(),
trigger_tokens.data(), trigger_tokens.size())
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
if (!grmr) {
return nullptr;
}
}
auto * result = new common_sampler {

View File

@@ -705,18 +705,6 @@ class Model:
if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
# ref: https://huggingface.co/Xenova/gpt-4o
res = "gpt-4o"
if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
# ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
res = "superbpe"
if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
# ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
res = "trillion"
if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
# ref: https://huggingface.co/inclusionAI/Ling-lite
res = "bailingmoe"
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
res = "llama4"
if res is None:
logger.warning("\n")
@@ -1611,7 +1599,6 @@ class StableLMModel(Model):
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
class LlamaModel(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True
def set_vocab(self):
try:
@@ -1676,11 +1663,10 @@ class LlamaModel(Model):
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if self.undo_permute:
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
@@ -1757,68 +1743,13 @@ class LlamaModel(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("Llama4ForConditionalGeneration")
class Llama4Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA4
has_vision: bool = False
undo_permute = False
# TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config"
# same with llama, but we need to merge the text_config into the root level of hparams
def __init__(self, *args, **kwargs):
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
kwargs["hparams"] = hparams
super().__init__(*args, **kwargs)
if "vision_config" in hparams:
logger.info("Has vision encoder, but it will be ignored")
self.has_vision = True
# IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this
self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"]
self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"]
def set_vocab(self):
self._set_vocab_gpt2()
self.gguf_writer.add_add_bos_token(True)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("language_model.", "")
name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now
# split the gate_up into gate and up
if "gate_up_proj" in name:
name_up = name.replace("gate_up_proj", "up_proj.weight")
name_gate = name.replace("gate_up_proj", "gate_proj.weight")
dim_half = data_torch.shape[-1] // 2
gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
return [
(self.map_tensor_name(name_gate), gate_proj_weight),
(self.map_tensor_name(name_up), up_proj_weight)
]
if name.endswith("down_proj"):
name += ".weight"
data_torch = data_torch.transpose(-1, -2)
if "multi_modal_projector" in name or "vision_model" in name:
return []
return super().modify_tensors(data_torch, name, bid)
@Model.register("Mistral3ForConditionalGeneration")
class Mistral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
# we need to merge the text_config into the root level of hparams
def __init__(self, *args, **kwargs):
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
hparams = Model.load_hparams(kwargs["dir_model"])
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
kwargs["hparams"] = hparams
@@ -2335,7 +2266,7 @@ class Qwen2Model(Model):
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
@Model.register("Qwen2VLForConditionalGeneration")
class Qwen2VLModel(Model):
model_arch = gguf.MODEL_ARCH.QWEN2VL
@@ -2459,16 +2390,6 @@ class Qwen2MoeModel(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("Qwen3ForCausalLM")
class Qwen3Model(Qwen2Model):
model_arch = gguf.MODEL_ARCH.QWEN3
@Model.register("Qwen3MoeForCausalLM")
class Qwen3MoeModel(Qwen2MoeModel):
model_arch = gguf.MODEL_ARCH.QWEN3MOE
@Model.register("GPT2LMHeadModel")
class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
@@ -3461,7 +3382,7 @@ class Gemma3Model(Model):
# we need to merge the text_config into the root level of hparams
def __init__(self, *args, **kwargs):
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
hparams = Model.load_hparams(kwargs["dir_model"])
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
kwargs["hparams"] = hparams
@@ -3627,8 +3548,8 @@ class RWKV6Qwen2Model(Rwkv6Model):
head_size = hidden_size // num_attention_heads
rms_norm_eps = self.hparams["rms_norm_eps"]
intermediate_size = self.hparams["intermediate_size"]
time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
@@ -3879,6 +3800,8 @@ class MambaModel(Model):
_tok_embd = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
@@ -3888,10 +3811,6 @@ class MambaModel(Model):
logger.debug("A_log --> A ==> " + new_name)
data_torch = -torch.exp(data_torch)
# [4 1 8192 1] -> [4 8192 1 1]
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
data_torch = data_torch.squeeze()
# assuming token_embd.weight is seen before output.weight
if self._tok_embd is not None and new_name == output_name:
if torch.equal(self._tok_embd, data_torch):
@@ -4495,29 +4414,6 @@ class DeepseekV2Model(Model):
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("PLMForCausalLM")
class PLMModel(Model):
model_arch = gguf.MODEL_ARCH.PLM
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["v_head_dim"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
return [(self.map_tensor_name(name), data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
@Model.register("T5WithLMHeadModel")
@Model.register("T5ForConditionalGeneration")
@Model.register("MT5ForConditionalGeneration")
@@ -5206,105 +5102,6 @@ class GraniteMoeModel(GraniteModel):
return super().modify_tensors(data_torch, name, bid)
@Model.register("BailingMoeForCausalLM")
class BailingMoeModel(Model):
model_arch = gguf.MODEL_ARCH.BAILINGMOE
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_weights_scale(1.0)
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
_experts: list[dict[str, Tensor]] | None = None
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
n_embd = self.hparams["hidden_size"]
head_dim = self.hparams.get("head_dim") or n_embd // n_head
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
if name.endswith("attention.dense.weight"):
return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
elif name.endswith("query_key_value.weight"):
q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
]
elif name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
tensors: list[tuple[str, Tensor]] = []
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
new_name = self.map_tensor_name(name)
if new_name == output_name and self.hparams.get("norm_head"):
data_torch = data_torch.float()
data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
return [(new_name, data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("ChameleonForConditionalGeneration")
@Model.register("ChameleonForCausalLM") # obsolete
class ChameleonModel(Model):
@@ -5558,7 +5355,7 @@ def main() -> None:
logger.error(f"Model {model_architecture} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, output_type, fname_out,
model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
eager=args.no_lazy,
metadata_override=args.metadata, model_name=args.model_name,

View File

@@ -110,10 +110,6 @@ models = [
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
]

View File

@@ -145,13 +145,8 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
* Clang 19
* Ninja
* Visual Studio 2022
* Powershell 7
Visual Studio provides necessary headers and libraries although it is not directly used for building.
Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
Powershell 7 is used for the following commands.
If an older version of Powershell is used, these commands may not work as they are.
Powershell is used for the following instructions.
### I. Setup Environment
@@ -201,9 +196,10 @@ ninja
## Known Issues
- Currently OpenCL backend does not work on Adreno 6xx GPUs.
- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
## TODO
- Fix Qwen2.5 0.5B
- Optimization for Q6_K
- Support and optimization for Q4_K

View File

@@ -20,7 +20,7 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@@ -227,6 +227,16 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
cmake --build buildWithCublas --config Release
```
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
```sh
@@ -240,6 +250,16 @@ cmake --build build-nvidia --config Release
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
```sh
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
# Find your HIPTARGET with rocminfo, under the key 'Name:'
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
cmake --build buildWithrocBLAS --config Release
```
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -302,16 +322,15 @@ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -
cmake --build build --config Release -j -v
```
It is possible to come across some precision issues when running tests that stem from using faster
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
as `-cl-fp32-correctly-rounded-divide-sqrt`
#### Nvidia GPU
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with Nvidia BLAS acceleration through SYCL
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
@@ -326,15 +345,14 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=
cmake --build build --config Release -j -v
```
It is possible to come across some precision issues when running tests that stem from using faster
instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
#### AMD GPU
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Export relevant ENV variables
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
# Build LLAMA with rocBLAS acceleration through SYCL
## AMD
@@ -425,13 +443,13 @@ Examples:
- Use device 0:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
*Notes:*
@@ -475,12 +493,6 @@ b. Enable oneAPI running environment:
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
- if you are using Powershell, enable the runtime environment with the following:
```
cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'
```
c. Verify installation
In the oneAPI command line, run the following to print the available SYCL devices:
@@ -511,13 +523,13 @@ You could download the release package for Windows directly, which including bin
Choose one of following methods to build from source code.
#### 1. Script
1. Script
```sh
.\examples\sycl\win-build-sycl.bat
```
#### 2. CMake
2. CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
@@ -546,84 +558,13 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```
#### 3. Visual Studio
3. Visual Studio
You have two options to use Visual Studio to build llama.cpp:
- As CMake Project using CMake presets.
- Creating a Visual Studio solution to handle the project.
**Note**:
All following commands are executed in PowerShell.
##### - Open as a CMake Project
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
- `x64-windows-sycl-release`
- `x64-windows-sycl-debug`
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:*
- For a minimal experimental setup, you can build only the inference executable using:
```Powershell
cmake --build build --config Release -j --target llama-cli
```
##### - Generating a Visual Studio Solution
You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
If you want to use the Intel C++ Compiler for the entire `llama.cpp` project, run the following command:
```Powershell
cmake -B build -G "Visual Studio 17 2022" -T "Intel C++ Compiler 2025" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release
```
If you prefer to use the Intel C++ Compiler only for `ggml-sycl`, ensure that `ggml` and its backend libraries are built as shared libraries ( i.e. `-DBUILD_SHARED_LIBRARIES=ON`, this is default behaviour):
```Powershell
cmake -B build -G "Visual Studio 17 2022" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release \
-DSYCL_INCLUDE_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\include" \
-DSYCL_LIBRARY_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\lib"
```
If successful the build files have been written to: *path/to/llama.cpp/build*
Open the project file **build/llama.cpp.sln** with Visual Studio.
Once the Visual Studio solution is created, follow these steps:
1. Open the solution in Visual Studio.
2. Right-click on `ggml-sycl` and select **Properties**.
3. In the left column, expand **C/C++** and select **DPC++**.
4. In the right panel, find **Enable SYCL Offload** and set it to `Yes`.
5. Apply the changes and save.
*Navigation Path:*
```
Properties -> C/C++ -> DPC++ -> Enable SYCL Offload (Yes)
```
Now, you can build `llama.cpp` with the SYCL backend as a Visual Studio project.
To do it from menu: `Build -> Build Solution`.
Once it is completed, final results will be in **build/Release/bin**
*Additional Note*
- You can avoid specifying `SYCL_INCLUDE_DIR` and `SYCL_LIBRARY_DIR` in the CMake command by setting the environment variables:
- `SYCL_INCLUDE_DIR_HINT`
- `SYCL_LIBRARY_DIR_HINT`
- Above instruction has been tested with Visual Studio 17 Community edition and oneAPI 2025.0. We expect them to work also with future version if the instructions are adapted accordingly.
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
### III. Run the inference
@@ -697,13 +638,13 @@ Examples:
- Use device 0:
```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
```
- Use multiple devices:
```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```

View File

@@ -132,14 +132,12 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
#### Compile and run inside a Fedora Toolbox Container
We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
**Recommended for:**
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- (there are no supported CUDA packages for these systems)
- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your your host operating system)
- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
@@ -191,7 +189,7 @@ The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@@ -218,7 +216,6 @@ By default, all supported compute capabilities are enabled. To customize this be
```bash
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
cmake --build build --config Release
```
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
@@ -436,116 +433,6 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
## Arm® KleidiAI™
KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
To enable KleidiAI, go to the llama.cpp directory and build using CMake
```bash
cmake -B build -DGGML_CPU_KLEIDIAI=ON
cmake --build build --config Release
```
You can verify that KleidiAI is being used by running
```bash
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
```
If KleidiAI is enabled, the ouput will contain a line similar to:
```
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
```
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
## OpenCL
This provides GPU acceleration through OpenCL on recent Adreno GPU.
More information about OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md) for more information.
### Android
Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
```sh
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && \
cd OpenCL-Headers && \
cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
cd OpenCL-ICD-Loader && \
mkdir build_ndk && cd build_ndk && \
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared && \
ninja && \
cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
```
Then build llama.cpp with OpenCL enabled,
```sh
cd ~/dev/llm
git clone https://github.com/ggml-org/llama.cpp && \
cd llama.cpp && \
mkdir build-android && cd build-android
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DBUILD_SHARED_LIBS=OFF \
-DGGML_OPENCL=ON
ninja
```
### Windows Arm64
First, install OpenCL headers and ICD loader library if not available,
```powershell
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
mkdir build && cd build
cmake .. -G Ninja `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
mkdir build && cd build
cmake .. -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
```
Then build llama.cpp with OpenCL enabled,
```powershell
cmake .. -G Ninja `
-DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-DBUILD_SHARED_LIBS=OFF `
-DGGML_OPENCL=ON
ninja
```
## Android
To read documentation for how to build on Android, [click here](./android.md)

View File

@@ -14,7 +14,9 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)
@@ -65,7 +67,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
sudo dnf distro-sync
```
2. **Install **Vim** the default text editor (Optional):**
2. **Install the Default Text Editor (Optional):**
```bash
sudo dnf install vim-default-editor --allowerasing
@@ -95,48 +97,36 @@ After adding the repository, synchronize the package manager again:
sudo dnf distro-sync
```
## Installing Nvidia Driver Libraries
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):
We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
```bash
ls -la /usr/lib64/libcuda.so.1
```
### If *`libcuda.so.1`* is missing:
```
ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
```
**Explanation:**
The host dose not supply the CUDA drivers, **install them now:**
#### Install the Nvidia Driver Libraries on Guest:
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
```bash
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
### If *`libcuda.so.1`* exists:
```
lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
```
### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
**Explanation:**
The host is supply the CUDA drivers, **we need to update the guest RPM Database accordingly:**
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:
Note: we do not actually install the libraries, we just update the DB so that the guest system knows they are supplied by the host.
##### 1. Download `nvidia-` parts that are supplied by the host RPM's (with dependencies)
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
```
##### 2. Update the RPM database to assume the installation of these packages.
#### 2. Update the RPM database to assume the installation of these packages.
```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
@@ -144,26 +134,23 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
**Note:**
- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.
- The `--justdb` option only updates the RPM database, without touching the filesystem.
##### Check that the RPM Database has been correctly updated:
**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* for if *`libcuda.so.1`* was missing.
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
After manually installing the dependencies, run:
```bash
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```
*(this time it will not install anything, as the database things that these packages are already installed)*
You should receive a message indicating the package is already installed:
```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Nothing to do.
```
@@ -220,9 +207,9 @@ You should see output similar to:
```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```
This output confirms that the CUDA compiler is accessible and indicates the installed version.

View File

@@ -9,13 +9,6 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
## MacPorts
```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/
## Nix
On Mac and Linux, the Nix package manager can be used via

View File

@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);

View File

@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: error: unable to load model\n" , __func__);

View File

@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());

View File

@@ -408,6 +408,8 @@ static void gguf_merge(const split_params & split_params) {
exit(EXIT_FAILURE);
}
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto * ctx_out = gguf_init_empty();
@@ -451,6 +453,7 @@ static void gguf_merge(const split_params & split_params) {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
@@ -463,6 +466,7 @@ static void gguf_merge(const split_params & split_params) {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
@@ -475,6 +479,7 @@ static void gguf_merge(const split_params & split_params) {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
@@ -495,11 +500,9 @@ static void gguf_merge(const split_params & split_params) {
fprintf(stderr, "\033[3Ddone\n");
}
std::ofstream fout;
if (!split_params.dry_run) {
fout.open(split_params.output.c_str(), std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
// placeholder for the meta data
// placeholder for the meta data
{
auto meta_size = gguf_get_meta_size(ctx_out);
::zeros(fout, meta_size);
}
@@ -515,9 +518,7 @@ static void gguf_merge(const split_params & split_params) {
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
if (!split_params.dry_run) {
fout.close();
}
fout.close();
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
@@ -539,11 +540,10 @@ static void gguf_merge(const split_params & split_params) {
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
f_input.seekg(offset);
f_input.read((char *)read_data.data(), n_bytes);
if (!split_params.dry_run) {
// write tensor data + padding
fout.write((const char *)read_data.data(), n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
// write tensor data + padding
fout.write((const char *)read_data.data(), n_bytes);
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
}
gguf_free(ctx_gguf);
@@ -552,15 +552,16 @@ static void gguf_merge(const split_params & split_params) {
fprintf(stderr, "\033[3Ddone\n");
}
if (!split_params.dry_run) {
{
// go back to beginning of file and write the updated metadata
fout.seekp(0);
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.write((const char *)data.data(), data.size());
fout.close();
gguf_free(ctx_out);
}
gguf_free(ctx_out);
fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
__func__, split_params.output.c_str(), n_split, total_tensors);

View File

@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);

View File

@@ -18,7 +18,6 @@ android {
}
externalNativeBuild {
cmake {
arguments += "-DLLAMA_CURL=OFF"
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DGGML_LLAMAFILE=OFF"
arguments += "-DCMAKE_BUILD_TYPE=Release"

View File

@@ -4,26 +4,6 @@
>
> This is very experimental, only used for demo purpose.
## Quick started
You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
```bash
# build
cmake -B build
cmake --build build --target llama-gemma3-cli
# alternatively, install from brew (MacOS)
brew install llama.cpp
# run it
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
# note: 1B model does not support vision
```
## How to get mmproj.gguf?
```bash

View File

@@ -1,273 +0,0 @@
#include "ggml.h"
#include "gguf.h"
#include <climits>
#include <cstdarg>
#include <string>
#include <map>
#include <sstream>
#include <vector>
// Internal header for clip.cpp
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
//
// tensor name constants
//
#define TN_TOKEN_EMBD "%s.token_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
#define TN_LN_1 "%s.blk.%d.ln1.%s"
#define TN_LN_2 "%s.blk.%d.ln2.%s"
#define TN_LN_PRE "%s.pre_ln.%s"
#define TN_LN_POST "%s.post_ln.%s"
#define TN_TEXT_PROJ "text_projection.weight"
#define TN_VIS_PROJ "visual_projection.weight"
#define TN_LLAVA_PROJ "mm.%d.%s"
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
// mimicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_UNKNOWN,
};
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
for (const auto & pair : PROJECTOR_TYPE_NAMES) {
if (pair.second == str) {
return pair.first;
}
}
return PROJECTOR_TYPE_UNKNOWN;
}
//
// logging
//
static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}
struct clip_logger_state {
ggml_log_level verbosity_thold;
ggml_log_callback log_callback;
void * log_callback_user_data;
};
extern struct clip_logger_state g_logger_state;
static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
if (format == NULL) {
return;
}
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
} else {
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0;
g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
free(buffer2);
}
va_end(args_copy);
}
static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
va_list args;
va_start(args, format);
clip_log_internal_v(level, format, args);
va_end(args);
}
#define LOG_TMPL(level, ...) \
do { \
if ((level) >= g_logger_state.verbosity_thold) { \
clip_log_internal((level), __VA_ARGS__); \
} \
} while (0)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
//
// common utils
//
static std::string string_format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), buf.size());
}
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
//
// gguf utils
//
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return string_format("unknown type %d", type);
}
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
string_replace_all(val, "\\", "\\\\");
string_replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,6 @@
#ifndef CLIP_H
#define CLIP_H
#include "ggml.h"
#include <stddef.h>
#include <stdint.h>
@@ -42,7 +41,7 @@ struct clip_image_f32_batch {
struct clip_context_params {
bool use_gpu;
ggml_log_level verbosity;
int verbosity;
};
// deprecated, use clip_init
@@ -77,7 +76,6 @@ CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
@@ -107,8 +105,6 @@ CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx);

View File

@@ -10,7 +10,7 @@
#include <vector>
#include <limits.h>
#include <cinttypes>
#include <inttypes.h>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
@@ -78,12 +78,8 @@ struct gemma3_context {
}
void init_clip_model(common_params & params) {
const char * clip_path = params.mmproj.path.c_str();
ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
if (!ctx_clip) {
LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
exit(1);
}
const char * clip_path = params.mmproj.c_str();
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
}
~gemma3_context() {
@@ -236,13 +232,13 @@ int main(int argc, char ** argv) {
common_init();
if (params.mmproj.path.empty()) {
if (params.mmproj.empty()) {
show_additional_info(argc, argv);
return 1;
}
gemma3_context ctx(params);
printf("%s: %s\n", __func__, params.model.path.c_str());
printf("%s: %s\n", __func__, params.model.c_str());
bool is_single_turn = !params.prompt.empty() && !params.image.empty();

View File

@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -234,14 +234,14 @@ static struct llama_model * llava_init(common_params * params) {
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.path.c_str();
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}

View File

@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
}
static struct clip_ctx * clip_init_context(common_params * params) {
const char * clip_path = params->mmproj.path.c_str();
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
@@ -88,7 +88,7 @@ static struct clip_ctx * clip_init_context(common_params * params) {
}
struct clip_context_params clip_params = {
/* use_gpu */ params->n_gpu_layers != 0,
/* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable
/* verbosity */ params->verbosity,
};
auto * ctx_clip = clip_init(clip_path, clip_params);
return ctx_clip;
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.mmproj.path.empty() || (params.image.empty())) {
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
return 1;
}

View File

@@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {
llama_model_params model_params = common_model_params_to_llama(*params);
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
@@ -323,14 +323,14 @@ static struct llama_model * llava_init(common_params * params) {
}
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
const char * clip_path = params->mmproj.path.c_str();
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 121 KiB

View File

@@ -1,81 +0,0 @@
#!/bin/bash
# make sure we are in the right directory
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd $SCRIPT_DIR
#export LLAMA_CACHE="$SCRIPT_DIR/tmp"
set -eux
mkdir -p $SCRIPT_DIR/output
PROJ_ROOT="$SCRIPT_DIR/../.."
cd $PROJ_ROOT
###############
arr_bin=()
arr_hf=()
add_test() {
local bin=$1
local hf=$2
arr_bin+=("$bin")
arr_hf+=("$hf")
}
add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M"
add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K"
add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"
add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
###############
cmake --build build -j --target "${arr_bin[@]}"
arr_res=()
for i in "${!arr_bin[@]}"; do
bin="${arr_bin[$i]}"
hf="${arr_hf[$i]}"
echo "Running test with binary: $bin and HF model: $hf"
echo ""
echo ""
output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" --temp 0 2>&1 | tee /dev/tty)
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
if echo "$output" | grep -iq "new york"; then
result="\033[32mOK\033[0m: $bin $hf"
else
result="\033[31mFAIL\033[0m: $bin $hf"
fi
echo -e "$result"
arr_res+=("$result")
echo ""
echo ""
echo ""
echo "#################################################"
echo "#################################################"
echo ""
echo ""
done
set +x
for i in "${!arr_res[@]}"; do
echo -e "${arr_res[$i]}"
done
echo ""
echo "Output logs are saved in $SCRIPT_DIR/output"

View File

@@ -106,8 +106,6 @@ int main(int argc, char ** argv) {
common_params params;
params.n_predict = 128;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}
@@ -407,7 +405,7 @@ int main(int argc, char ** argv) {
params.prompt_file = "used built-in defaults";
}
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);

View File

@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = common_model_params_to_llama(params);
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
if (model == NULL) {
LOG_ERR("%s: unable to load model\n" , __func__);

View File

@@ -851,7 +851,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
LOG("\ntask\tacc_norm\t95%% confidence interval\n");
LOG("\ntask\tacc_norm\n");
double acc = 0.0f;
@@ -985,22 +985,8 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
acc += 1.0;
}
double freq = acc / double(i + 1);
const double za = 1.95996398454;
// // Wald normal approx
// double conf =za*sqrt(freq*(1-freq)/double(i + 1));
// LOG("%zu\t%.8lf +/- %.8lf\n", i + 1, freq*100.0, conf*100.0);
// Wilson score interval, more accurate
double z = za * za / double(i + 1);
double cnf = z * sqrt(double(i + 1) * (4.0 * freq * (1 - freq) + z)) / (za + za);
double a = (freq + z * 0.5 - cnf) / (1.0 + z);
double b = (freq + z * 0.5 + cnf) / (1.0 + z);
// Print the accumulated accuracy mean x 100 and confidence interval
LOG("%zu\t%3.8lf%%\t[%3.4lf%%, %3.4lf%%]\n", i + 1, freq * 100.0, a * 100.0, b * 100.0);
// Print the accumulated accuracy mean x 100
LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
}
i0 = i1 - 1;

View File

@@ -1,4 +1,2 @@
set(TARGET rpc-server)
add_executable(${TARGET} rpc-server.cpp)
target_link_libraries(${TARGET} PRIVATE ggml)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

View File

@@ -72,14 +72,3 @@ $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name
This way you can offload model layers to both local and remote devices.
### Local cache
The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
This can speed up model loading significantly, especially when using large models.
To enable the cache, use the `-c` option:
```bash
$ bin/rpc-server -c
```
By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.

View File

@@ -1,7 +1,3 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "ggml-cpu.h"
#ifdef GGML_USE_CUDA
@@ -22,142 +18,26 @@
#include "ggml-rpc.h"
#ifdef _WIN32
# define DIRECTORY_SEPARATOR '\\'
# include <locale>
# include <windows.h>
# include <fcntl.h>
# include <io.h>
#else
# define DIRECTORY_SEPARATOR '/'
# include <unistd.h>
# include <sys/stat.h>
#endif
#include <codecvt>
#include <string>
#include <stdio.h>
#include <vector>
#include <filesystem>
namespace fs = std::filesystem;
// NOTE: this is copied from common.cpp to avoid linking with libcommon
// returns true if successful, false otherwise
static bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return true;
}
size_t pos_slash = 0;
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
const bool success = CreateDirectoryW(test, NULL);
if (!success) {
const DWORD error = GetLastError();
// if the path already exists, ensure that it's a directory
if (error == ERROR_ALREADY_EXISTS) {
const DWORD attributes = GetFileAttributesW(subpath.c_str());
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
return false;
}
} else {
return false;
}
}
pos_slash += 1;
}
return true;
#else
// if the path already exists, check whether it's a directory
struct stat info;
if (stat(path.c_str(), &info) == 0) {
return S_ISDIR(info.st_mode);
}
size_t pos_slash = 1; // skip leading slashes for directory creation
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
const std::string subpath = path.substr(0, pos_slash);
struct stat info;
// if the path already exists, ensure that it's a directory
if (stat(subpath.c_str(), &info) == 0) {
if (!S_ISDIR(info.st_mode)) {
return false;
}
} else {
// create parent directories
const int ret = mkdir(subpath.c_str(), 0755);
if (ret != 0) {
return false;
}
}
pos_slash += 1;
}
return true;
#endif // _WIN32
}
// NOTE: this is copied from common.cpp to avoid linking with libcommon
static std::string fs_get_cache_directory() {
std::string cache_directory = "";
auto ensure_trailing_slash = [](std::string p) {
// Make sure to add trailing slash
if (p.back() != DIRECTORY_SEPARATOR) {
p += DIRECTORY_SEPARATOR;
}
return p;
};
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
} else {
#ifdef __linux__
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
} else {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__
cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
}
return ensure_trailing_slash(cache_directory);
}
struct rpc_server_params {
std::string host = "127.0.0.1";
int port = 50052;
size_t backend_mem = 0;
bool use_cache = false;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, " -c, --cache enable local file cache\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, "\n");
}
@@ -178,8 +58,6 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-c" || arg == "--cache") {
params.use_cache = true;
} else if (arg == "-m" || arg == "--mem") {
if (++i >= argc) {
return false;
@@ -286,20 +164,8 @@ int main(int argc, char * argv[]) {
} else {
get_backend_memory(&free_mem, &total_mem);
}
const char * cache_dir = nullptr;
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
if (params.use_cache) {
if (!fs_create_directory_with_parents(cache_dir_str)) {
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
return 1;
}
cache_dir = cache_dir_str.c_str();
}
printf("Starting RPC server\n");
printf(" endpoint : %s\n", endpoint.c_str());
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View File

@@ -1,16 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
# TODO: avoid copying this code block from common/CMakeLists.txt
set(LLAMA_RUN_EXTRA_LIBS "")
if (LLAMA_CURL)
find_package(CURL REQUIRED)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -38,6 +38,24 @@
}
#endif
GGML_ATTRIBUTE_FORMAT(1, 2)
static std::string fmt(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
const int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::string buf;
buf.resize(size);
const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return buf;
}
GGML_ATTRIBUTE_FORMAT(1, 2)
static int printe(const char * fmt, ...) {
va_list args;
@@ -507,11 +525,11 @@ class HttpClient {
int secs = static_cast<int>(seconds) % 60;
if (hrs > 0) {
return string_format("%dh %02dm %02ds", hrs, mins, secs);
return fmt("%dh %02dm %02ds", hrs, mins, secs);
} else if (mins > 0) {
return string_format("%dm %02ds", mins, secs);
return fmt("%dm %02ds", mins, secs);
} else {
return string_format("%ds", secs);
return fmt("%ds", secs);
}
}
@@ -526,7 +544,7 @@ class HttpClient {
}
}
return string_format("%.2f %s", dbl_size, suffix[i]);
return fmt("%.2f %s", dbl_size, suffix[i]);
}
static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -560,9 +578,7 @@ class HttpClient {
return (now_downloaded_plus_file_size * 100) / total_to_download;
}
static std::string generate_progress_prefix(curl_off_t percentage) {
return string_format("%3ld%% |", static_cast<long int>(percentage));
}
static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
const auto now = std::chrono::steady_clock::now();
@@ -573,9 +589,9 @@ class HttpClient {
static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
double speed, double estimated_time) {
const int width = 10;
return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
width, human_readable_size(total_to_download).c_str(), width,
human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
human_readable_time(estimated_time).c_str());
}
static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -133,8 +133,7 @@ struct slot_params {
auto grammar_triggers = json::array();
for (const auto & trigger : sampling.grammar_triggers) {
server_grammar_trigger ct(std::move(trigger));
grammar_triggers.push_back(ct.to_json());
grammar_triggers.push_back(trigger.to_json<json>());
}
return json {
@@ -373,9 +372,9 @@ struct server_task {
const auto grammar_triggers = data.find("grammar_triggers");
if (grammar_triggers != data.end()) {
for (const auto & t : *grammar_triggers) {
server_grammar_trigger ct(t);
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
const auto & word = ct.value.value;
auto ct = common_grammar_trigger::from_json(t);
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
const auto & word = ct.value;
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
if (ids.size() == 1) {
auto token = ids[0];
@@ -393,7 +392,7 @@ struct server_task {
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
}
} else {
params.sampling.grammar_triggers.push_back(std::move(ct.value));
params.sampling.grammar_triggers.push_back(ct);
}
}
}
@@ -490,12 +489,8 @@ struct result_timings {
double predicted_per_token_ms;
double predicted_per_second;
// Optional speculative metrics - only included when > 0
int32_t draft_n = 0;
int32_t draft_n_accepted = 0;
json to_json() const {
json base = {
return {
{"prompt_n", prompt_n},
{"prompt_ms", prompt_ms},
{"prompt_per_token_ms", prompt_per_token_ms},
@@ -506,13 +501,6 @@ struct result_timings {
{"predicted_per_token_ms", predicted_per_token_ms},
{"predicted_per_second", predicted_per_second},
};
if (draft_n > 0) {
base["draft_n"] = draft_n;
base["draft_n_accepted"] = draft_n_accepted;
}
return base;
}
};
@@ -842,11 +830,6 @@ struct server_task_result_cmpl_final : server_task_result {
ret.push_back({"timings", timings.to_json()});
}
// extra fields for debugging purposes
if (verbose) {
ret["__verbose"] = to_json_non_oaicompat();
}
return ret;
}
};
@@ -1311,10 +1294,6 @@ struct server_slot {
std::function<void(int)> callback_on_release;
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
void reset() {
SLT_DBG(*this, "%s", "\n");
@@ -1331,10 +1310,6 @@ struct server_slot {
generated_tokens.clear();
generated_token_probs.clear();
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
}
bool is_non_causal() const {
@@ -1401,12 +1376,6 @@ struct server_slot {
timings.predicted_per_token_ms = t_token_generation / n_decoded;
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
// Add speculative metrics
if (n_draft_total > 0) {
timings.draft_n = n_draft_total;
timings.draft_n_accepted = n_draft_accepted;
}
return timings;
}
@@ -1454,15 +1423,6 @@ struct server_slot {
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
t_token_generation, n_decoded, t_gen, n_gen_second,
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
if (n_draft_total > 0) {
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
SLT_INF(*this,
"\n"
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
draft_ratio, n_draft_accepted, n_draft_total
);
}
}
json to_json() const {
@@ -1705,8 +1665,6 @@ private:
};
struct server_response {
bool running = true;
// for keeping track of all tasks waiting for the result
std::unordered_set<int> waiting_task_ids;
@@ -1761,10 +1719,6 @@ struct server_response {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
if (!running) {
SRV_DBG("%s : queue result stop\n", __func__);
std::terminate(); // we cannot return here since the caller is HTTP code
}
return !queue_results.empty();
});
@@ -1795,10 +1749,6 @@ struct server_response {
}
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
if (!running) {
SRV_DBG("%s : queue result stop\n", __func__);
std::terminate(); // we cannot return here since the caller is HTTP code
}
if (cr_res == std::cv_status::timeout) {
return nullptr;
}
@@ -1828,12 +1778,6 @@ struct server_response {
}
}
}
// terminate the waiting loop
void terminate() {
running = false;
condition_results.notify_all();
}
};
struct server_context {
@@ -1893,7 +1837,7 @@ struct server_context {
}
bool load_model(const common_params & params) {
SRV_INF("loading model '%s'\n", params.model.path.c_str());
SRV_INF("loading model '%s'\n", params.model.c_str());
params_base = params;
@@ -1903,7 +1847,7 @@ struct server_context {
ctx = llama_init.context.get();
if (model == nullptr) {
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
return false;
}
@@ -1914,13 +1858,16 @@ struct server_context {
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
auto params_dft = params_base;
params_dft.devices = params_base.speculative.devices;
params_dft.hf_file = params_base.speculative.hf_file;
params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
@@ -1934,12 +1881,12 @@ struct server_context {
model_dft = llama_init_dft.model.get();
if (model_dft == nullptr) {
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
return false;
}
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
return false;
}
@@ -3338,9 +3285,6 @@ struct server_context {
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
// keep track of total number of tokens generated in the draft
slot.n_draft_total += draft.size();
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3366,9 +3310,6 @@ struct server_context {
slot.n_past += ids.size();
slot.n_decoded += ids.size();
// update how many tokens out of draft was accepted
slot.n_draft_accepted += ids.size() - 1;
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
@@ -3879,7 +3820,7 @@ int main(int argc, char ** argv) {
json data = {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model.path },
{ "model_path", ctx_server.params_base.model },
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -4145,7 +4086,7 @@ int main(int argc, char ** argv) {
{"object", "list"},
{"data", {
{
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
{"id", params.model_alias.empty() ? params.model : params.model_alias},
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
@@ -4507,31 +4448,21 @@ int main(int argc, char ** argv) {
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
// clean up function, to be called before exit
auto clean_up = [&svr, &ctx_server]() {
auto clean_up = [&svr]() {
SRV_INF("%s: cleaning up before exit...\n", __func__);
svr->stop();
ctx_server.queue_results.terminate();
llama_backend_free();
};
// bind HTTP listen port
bool was_bound = false;
if (string_ends_with(std::string(params.hostname), ".sock")) {
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
svr->set_address_family(AF_UNIX);
// bind_to_port requires a second arg, any value other than 0 should
// simply get ignored
was_bound = svr->bind_to_port(params.hostname, 8080);
} else {
LOG_INF("%s: binding port with default address family\n", __func__);
// bind HTTP listen port
if (params.port == 0) {
int bound_port = svr->bind_to_any_port(params.hostname);
if ((was_bound = (bound_port >= 0))) {
params.port = bound_port;
}
} else {
was_bound = svr->bind_to_port(params.hostname, params.port);
if (params.port == 0) {
int bound_port = svr->bind_to_any_port(params.hostname);
if ((was_bound = (bound_port >= 0))) {
params.port = bound_port;
}
} else {
was_bound = svr->bind_to_port(params.hostname, params.port);
}
if (!was_bound) {
@@ -4551,7 +4482,7 @@ int main(int argc, char ** argv) {
if (!ctx_server.load_model(params)) {
clean_up();
t.join();
// t.join(); // FIXME: see below
LOG_ERR("%s: exiting due to model loading error\n", __func__);
return 1;
}
@@ -4599,7 +4530,7 @@ int main(int argc, char ** argv) {
ctx_server.queue_tasks.start_loop();
clean_up();
t.join();
// t.join(); // FIXME: http thread may stuck if there is an on-going request. we don't need to care about this for now as the HTTP connection will already be closed at this point, but it's better to fix this
return 0;
}

View File

@@ -17,7 +17,7 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
```shell
cd ../../..
cmake -B build
cmake -B build -DLLAMA_CURL=ON
cmake --build build --target llama-server
```

View File

@@ -49,26 +49,6 @@ def test_embedding_multiple():
assert len(d['embedding']) > 1
def test_embedding_multiple_with_fa():
server = ServerPreset.bert_bge_small_with_fa()
server.pooling = 'last'
server.start()
# one of these should trigger the FA branch (i.e. context size % 256 == 0)
res = server.make_request("POST", "/v1/embeddings", data={
"input": [
"a "*253,
"b "*254,
"c "*255,
"d "*256,
],
})
assert res.status_code == 200
assert len(res.body['data']) == 4
for d in res.body['data']:
assert 'embedding' in d
assert len(d['embedding']) > 1
@pytest.mark.parametrize(
"input,is_multi_prompt",
[

View File

@@ -323,21 +323,6 @@ class ServerPreset:
server.server_embeddings = True
return server
@staticmethod
def bert_bge_small_with_fa() -> ServerProcess:
server = ServerProcess()
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
server.model_alias = "bert-bge-small"
server.n_ctx = 1024
server.n_batch = 300
server.n_ubatch = 300
server.n_slots = 2
server.fa = True
server.seed = 42
server.server_embeddings = True
return server
@staticmethod
def tinyllama_infill() -> ServerProcess:
server = ServerProcess()

View File

@@ -3,7 +3,7 @@
#include "common.h"
#include "log.h"
#include "llama.h"
#include "base64.hpp"
#include "common/base64.hpp"
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -58,32 +58,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
// thin wrapper around common_grammar_trigger with (de)serialization functions
struct server_grammar_trigger {
common_grammar_trigger value;
server_grammar_trigger() = default;
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
server_grammar_trigger(const json & in) {
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
value.value = in.at("value").get<std::string>();
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
value.token = (llama_token) in.at("token").get<int>();
}
}
json to_json() const {
json out {
{"type", (int) value.type},
{"value", value.value},
};
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
out["token"] = (int) value.token;
}
return out;
}
};
//
// tokenizer and input processing utils
//
@@ -653,8 +627,7 @@ static json oaicompat_completion_params_parse(
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
auto grammar_triggers = json::array();
for (const auto & trigger : chat_params.grammar_triggers) {
server_grammar_trigger ct(trigger);
grammar_triggers.push_back(ct.to_json());
grammar_triggers.push_back(trigger.to_json<json>());
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;

File diff suppressed because it is too large Load Diff

View File

@@ -13,11 +13,9 @@
"dependencies": {
"@heroicons/react": "^2.2.0",
"@sec-ant/readable-stream": "^0.6.0",
"@tailwindcss/postcss": "^4.1.1",
"@tailwindcss/vite": "^4.1.1",
"@vscode/markdown-it-katex": "^1.1.1",
"autoprefixer": "^10.4.20",
"daisyui": "^5.0.12",
"daisyui": "^4.12.14",
"dexie": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
@@ -31,7 +29,7 @@
"remark-breaks": "^4.0.0",
"remark-gfm": "^4.0.0",
"remark-math": "^6.0.0",
"tailwindcss": "^4.1.1",
"tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3"
},

View File

@@ -1,5 +1,6 @@
export default {
plugins: {
"@tailwindcss/postcss": {},
tailwindcss: {},
autoprefixer: {},
},
}

View File

@@ -28,7 +28,7 @@ function AppLayout() {
<>
<Sidebar />
<div
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto bg-base-100"
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
id="main-scroll"
>
<Header />

View File

@@ -1,4 +1,4 @@
import daisyuiThemes from 'daisyui/theme/object';
import daisyuiThemes from 'daisyui/src/theming/themes';
import { isNumeric } from './utils/misc';
export const isDev = import.meta.env.MODE === 'development';

View File

@@ -1,4 +1,4 @@
import { useEffect, useMemo, useState } from 'react';
import { useEffect, useMemo, useRef, useState } from 'react';
import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context';
import ChatMessage from './ChatMessage';
import { CanvasType, Message, PendingMessage } from '../utils/types';
@@ -6,7 +6,6 @@ import { classNames, cleanCurrentUrl, throttle } from '../utils/misc';
import CanvasPyInterpreter from './CanvasPyInterpreter';
import StorageUtils from '../utils/storage';
import { useVSCodeContext } from '../utils/llama-vscode';
import { useChatTextarea, ChatTextareaApi } from './useChatTextarea.ts';
/**
* A message display is a message node with additional information for rendering.
@@ -100,8 +99,7 @@ export default function ChatScreen() {
canvasData,
replaceMessageAndGenerate,
} = useAppContext();
const textarea: ChatTextareaApi = useChatTextarea(prefilledMsg.content());
const textarea = useOptimizedTextarea(prefilledMsg.content());
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
// TODO: improve this when we have "upload file" feature
@@ -250,16 +248,14 @@ export default function ChatScreen() {
</div>
{/* chat input */}
<div className="flex flex-row items-end pt-8 pb-6 sticky bottom-0 bg-base-100">
<div className="flex flex-row items-center pt-8 pb-6 sticky bottom-0 bg-base-100">
<textarea
// Default (mobile): Enable vertical resize, overflow auto for scrolling if needed
// Large screens (lg:): Disable manual resize, apply max-height for autosize limit
className="textarea textarea-bordered w-full resize-vertical lg:resize-none lg:max-h-48 lg:overflow-y-auto" // Adjust lg:max-h-48 as needed (e.g., lg:max-h-60)
className="textarea textarea-bordered w-full"
placeholder="Type a message (Shift+Enter to add a new line)"
ref={textarea.ref}
onInput={textarea.onInput} // Hook's input handler (will only resize height on lg+ screens)
onKeyDown={(e) => {
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
if (e.key === 'Enter' && e.shiftKey) return;
if (e.key === 'Enter' && !e.shiftKey) {
e.preventDefault();
sendNewMessage();
@@ -267,11 +263,7 @@ export default function ChatScreen() {
}}
id="msg-input"
dir="auto"
// Set a base height of 2 rows for mobile views
// On lg+ screens, the hook will calculate and set the initial height anyway
rows={2}
></textarea>
{isGenerating(currConvId ?? '') ? (
<button
className="btn btn-neutral ml-2"
@@ -294,3 +286,43 @@ export default function ChatScreen() {
</div>
);
}
export interface OptimizedTextareaValue {
value: () => string;
setValue: (value: string) => void;
focus: () => void;
ref: React.RefObject<HTMLTextAreaElement>;
}
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
// See https://github.com/ggml-org/llama.cpp/pull/12299
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
const textareaRef = useRef<HTMLTextAreaElement>(null);
useEffect(() => {
if (textareaRef.current && savedInitValue) {
textareaRef.current.value = savedInitValue;
setSavedInitValue('');
}
}, [textareaRef, savedInitValue, setSavedInitValue]);
return {
value: () => {
return textareaRef.current?.value ?? savedInitValue;
},
setValue: (value: string) => {
if (textareaRef.current) {
textareaRef.current.value = value;
}
},
focus: () => {
if (textareaRef.current) {
// focus and move the cursor to the end
textareaRef.current.focus();
textareaRef.current.selectionStart = textareaRef.current.value.length;
}
},
ref: textareaRef,
};
}

View File

@@ -2,7 +2,7 @@ import { useEffect, useState } from 'react';
import StorageUtils from '../utils/storage';
import { useAppContext } from '../utils/app.context';
import { classNames } from '../utils/misc';
import daisyuiThemes from 'daisyui/theme/object';
import daisyuiThemes from 'daisyui/src/theming/themes';
import { THEMES } from '../Config';
import { useNavigate } from 'react-router';
@@ -20,6 +20,7 @@ export default function Header() {
document.body.setAttribute('data-theme', selectedTheme);
document.body.setAttribute(
'data-color-scheme',
// @ts-expect-error daisyuiThemes complains about index type, but it should work
daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
);
}, [selectedTheme]);

View File

@@ -1,96 +0,0 @@
import { useEffect, useRef, useState, useCallback } from 'react';
// Media Query for detecting "large" screens (matching Tailwind's lg: breakpoint)
const LARGE_SCREEN_MQ = '(min-width: 1024px)';
// Calculates and sets the textarea height based on its scrollHeight
const adjustTextareaHeight = (textarea: HTMLTextAreaElement | null) => {
if (!textarea) return;
// Only perform auto-sizing on large screens
if (!window.matchMedia(LARGE_SCREEN_MQ).matches) {
// On small screens, reset inline height and max-height styles.
// This allows CSS (e.g., `rows` attribute or classes) to control the height,
// and enables manual resizing if `resize-vertical` is set.
textarea.style.height = ''; // Use 'auto' or '' to reset
textarea.style.maxHeight = '';
return; // Do not adjust height programmatically on small screens
}
const computedStyle = window.getComputedStyle(textarea);
// Get the max-height specified by CSS (e.g., from `lg:max-h-48`)
const currentMaxHeight = computedStyle.maxHeight;
// Temporarily remove max-height to allow scrollHeight to be calculated correctly
textarea.style.maxHeight = 'none';
// Reset height to 'auto' to measure the actual scrollHeight needed
textarea.style.height = 'auto';
// Set the height to the calculated scrollHeight
textarea.style.height = `${textarea.scrollHeight}px`;
// Re-apply the original max-height from CSS to enforce the limit
textarea.style.maxHeight = currentMaxHeight;
};
// Interface describing the API returned by the hook
export interface ChatTextareaApi {
value: () => string;
setValue: (value: string) => void;
focus: () => void;
ref: React.RefObject<HTMLTextAreaElement>;
onInput: (event: React.FormEvent<HTMLTextAreaElement>) => void; // Input handler
}
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
// See https://github.com/ggml-org/llama.cpp/pull/12299
// combined now with auto-sizing logic.
export function useChatTextarea(initValue: string): ChatTextareaApi {
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
const textareaRef = useRef<HTMLTextAreaElement>(null);
// Effect to set initial value and height on mount or when initValue changes
useEffect(() => {
const textarea = textareaRef.current;
if (textarea) {
if (typeof savedInitValue === 'string' && savedInitValue.length > 0) {
textarea.value = savedInitValue;
// Call adjustTextareaHeight - it will check screen size internally
setTimeout(() => adjustTextareaHeight(textarea), 0);
setSavedInitValue(''); // Reset after applying
} else {
// Adjust height even if there's no initial value (for initial render)
setTimeout(() => adjustTextareaHeight(textarea), 0);
}
}
}, [textareaRef, savedInitValue]); // Depend on ref and savedInitValue
const handleInput = useCallback(
(event: React.FormEvent<HTMLTextAreaElement>) => {
// Call adjustTextareaHeight on every input - it will decide whether to act
adjustTextareaHeight(event.currentTarget);
},
[]
);
return {
// Method to get the current value directly from the textarea
value: () => {
return textareaRef.current?.value ?? '';
},
// Method to programmatically set the value and trigger height adjustment
setValue: (value: string) => {
const textarea = textareaRef.current;
if (textarea) {
textarea.value = value;
// Call adjustTextareaHeight - it will check screen size internally
setTimeout(() => adjustTextareaHeight(textarea), 0);
}
},
focus: () => {
if (textareaRef.current) {
textareaRef.current.focus();
}
},
ref: textareaRef,
onInput: handleInput,
};
}

View File

@@ -1,13 +1,8 @@
@use 'sass:meta';
@use 'tailwindcss';
@plugin 'daisyui' {
themes: all;
}
html {
scrollbar-gutter: auto;
}
@tailwind base;
@tailwind components;
@tailwind utilities;
.markdown {
h1,

View File

@@ -1,6 +1,6 @@
import { useEffect, useState } from 'react';
import { MessageExtraContext } from './types';
import { ChatTextareaApi } from '../components/useChatTextarea.ts';
import { OptimizedTextareaValue } from '../components/ChatScreen';
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -15,7 +15,7 @@ interface SetTextEvData {
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
*/
export const useVSCodeContext = (textarea: ChatTextareaApi) => {
export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
null
);

View File

@@ -15,7 +15,7 @@ async def main():
model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding",
json= {"content": "a "*1022}
json= {"content": str(0)*1024}
) for i in range(n)])
for response in responses:

View File

@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.speculative.model.path.empty()) {
if (params.speculative.model.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}

View File

@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
common_init();
if (params.speculative.model.path.empty()) {
if (params.speculative.model.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}

View File

@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
:: for FP32
cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
if %errorlevel% neq 0 goto ERROR
:: build example/main only
:: make main

View File

@@ -577,7 +577,12 @@ int main(int argc, char ** argv) {
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
params.model = params.vocoder.model;
// TODO: refactor in a common struct
params.model = params.vocoder.model;
params.model_url = params.vocoder.model_url;
params.hf_repo = params.vocoder.hf_repo;
params.hf_file = params.vocoder.hf_file;
params.embedding = true;
common_init_result llama_init_cts = common_init_from_params(params);
@@ -694,13 +699,11 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
const std::string voice_data = audio_data;
auto tmp = common_tokenize(vocab, voice_data, false, true);
std::ostringstream tokens_oss;
printf("\n\n");
for (size_t i = 0; i < tmp.size(); ++i) {
tokens_oss << tmp[i] << ", ";
printf("%d, ", tmp[i]);
}
LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
printf("\n\n");
prompt_add(prompt_inp, tmp);
#else
prompt_add(prompt_inp, llama_tokens {

View File

@@ -100,10 +100,6 @@ else()
set(INS_ENB ON)
endif()
message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
message(DEBUG "INS_ENB : ${INS_ENB}")
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
@@ -127,12 +123,10 @@ endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
if (WIN32)

View File

@@ -1,22 +0,0 @@
find_package(Git)
# the commit's SHA1
execute_process(COMMAND
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_SHA1
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the date of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_DATE
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the subject of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%s
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

View File

@@ -5,7 +5,7 @@
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
find_package(Threads REQUIRED)

View File

@@ -17,9 +17,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
const char * cache_dir,
size_t free_mem, size_t total_mem);
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

View File

@@ -1791,11 +1791,11 @@ extern "C" {
#define GGML_KQ_MASK_PAD 64
// q: [n_embd_k, n_batch, n_head, 1]
// k: [n_embd_k, n_kv, n_head_kv, 1]
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
// q: [n_embd, n_batch, n_head, 1]
// k: [n_embd, n_kv, n_head_kv, 1]
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_context * ctx,
struct ggml_tensor * q,

View File

@@ -65,7 +65,7 @@ if (GGML_LTO)
endif()
endif()
if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
if (GGML_CCACHE)
find_program(GGML_CCACHE_FOUND ccache)
find_program(GGML_SCCACHE_FOUND sccache)

View File

@@ -0,0 +1,168 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentCaseLabels: true
IndentGotoLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: Auto
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
...

View File

@@ -51,11 +51,13 @@ if (CANN_INSTALL_DIR)
${CANN_INSTALL_DIR}/acllib/include
)
add_subdirectory(kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
file(GLOB GGML_SOURCES_CANN "*.cpp")

View File

@@ -54,7 +54,9 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
// added.
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
int64_t acl_storage_len = 0;
if (ne == nullptr) {
acl_storage_len = ggml_nbytes(tensor);
for (int i = 0; i < GGML_MAX_DIMS; i++) {
acl_ne[i] = tensor->ne[i];
// The step size of acl is in elements.
@@ -63,18 +65,14 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
} else {
// With bcast
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
acl_ne[i] = ne[i];
acl_stride[i] = nb[i] / ggml_element_size(tensor);
}
}
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
int64_t acl_storage_len = 1;
for (int i = 0; i < final_dims; i++) {
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
}
// Reverse ne and stride.
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
std::reverse(acl_ne, acl_ne + final_dims);
std::reverse(acl_stride, acl_stride + final_dims);

View File

@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
tmp_stride[i] = nb[i] / type_size;
}
int64_t acl_storage_len = 1;
for (int i = 0; i < dims; i++) {
acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,15 @@
#ifndef CANN_ACLNN_OPS
#define CANN_ACLNN_OPS
/**
* @file acl_tensor
* @brief This file contains related functions of ggml_tensor and acl_tensor.
* Contains conversion from ggml_tensor to acl_tensor, broadcast and other
* functions.
* @author hipudding <huafengchun@gmail.com>
* @author wangshuai09 <391746016@qq.com>
* @date July 15, 2024
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -20,28 +31,20 @@
* IN THE SOFTWARE.
*/
#ifndef CANN_ACLNN_OPS
#define CANN_ACLNN_OPS
#include <aclnnop/aclnn_abs.h>
#include <aclnnop/aclnn_neg.h>
#include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_add.h>
#include <aclnnop/aclnn_arange.h>
#include <aclnnop/aclnn_argsort.h>
#include <aclnnop/aclnn_cat.h>
#include <aclnnop/aclnn_clamp.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_gelu.h>
#include <aclnnop/aclnn_gelu_v2.h>
#include <aclnnop/aclnn_sigmoid.h>
#include <aclnnop/aclnn_hardsigmoid.h>
#include <aclnnop/aclnn_hardswish.h>
#include <aclnnop/aclnn_leaky_relu.h>
#include <aclnnop/aclnn_mul.h>
#include <aclnnop/aclnn_relu.h>
#include <aclnnop/aclnn_silu.h>
#include <aclnnop/aclnn_tanh.h>
#include <aclnnop/aclnn_sqrt.h>
#include <aclnnop/aclnn_sin.h>
#include <aclnnop/aclnn_cos.h>
#include "acl_tensor.h"
#include "common.h"
@@ -60,6 +63,23 @@
*/
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Adds two ggml tensors using the CANN backend.
*
* @details This function performs an element-wise addition of two tensors. In
* case the tensors do not have the same shape, one or both tensors
* will be broadcasted to match the shape of the other before the
* addition is performed.The formula for the operation is given by:
* \f[
* \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The ggml tensor representing the destination, result of the
* addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
*/
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies the Leaky ReLU activation function to a tensor using the CANN
* backend.
@@ -111,6 +131,19 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
*/
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the square of the elements of a ggml tensor using the CANN
* backend.
* @details The function sets the second source tensor of the destination
* tensor `dst` to be equal to the first source tensor. This is
* effectively squaring the elements since the multiplication becomes
* `element * element`.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the squared values will be stored
* which dst->op is `GGML_OP_SQR`.
*/
void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies a clamp operation to the elements of a ggml tensor using the
* CANN backend.
@@ -242,20 +275,6 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
*/
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the sum of elements in a ggml tensor.
*
* @details This function performs a reduction sum operation along the last
* dimension of the input tensor `src`. The result of the sum is stored
* in the destination tensor `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the reduced values will be stored。
*
*/
void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Upsamples a ggml tensor using nearest neighbor interpolation using
* the CANN backend.
@@ -465,326 +484,109 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
*/
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the index of the maximum value along the specified dimension
* of a ggml tensor using the CANN backend.
*
* @details This function performs an argmax operation on the input tensor.
* It finds the index of the maximum value along the specified axis
* and stores these indices in the destination tensor `dst`. The
* operation is executed using the CANN backend for optimized performance.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the indices of the maximum values will
* be stored. dst->op is `GGML_OP_ARGMAX`.
*/
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Adds two tensors element-wise and stores the result in a destination
* tensor.
*
* This function performs the operation:
* \f[
* dst = acl\_src0 + alpha \times acl\_src1
* \f]
* where alpha is a scalar value and defaults to 1.0f.
*
* @param ctx The context for the CANN backend operations.
* @param acl_src0 The first source tensor.
* @param acl_src1 The second source tensor.
* @param acl_dst The destination tensor where the result will be stored.
*/
void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
/**
* @brief Sub two tensors element-wise and stores the result in a destination
* tensor.
*
* This function performs the operation:
* \f[
* dst = acl\_src0 - alpha \times acl\_src1
* \f]
* where alpha is a scalar value and defaults to 1.0f.
*
* @param ctx The context for the CANN backend operations.
* @param acl_src0 The first source tensor.
* @param acl_src1 The second source tensor.
* @param acl_dst The destination tensor where the result will be stored.
*/
void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
aclTensor* acl_src1, aclTensor* acl_dst = nullptr);
/**
* @brief Performs element-wise multiplication of two tensors and stores the
* result in a destination tensor.
*
* This function performs element-wise multiplication of the tensors `acl_src`
* and `acl_other` and stores the result in the destination tensor `acl_dst`.
* The operation is defined as:
* \f[
* \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_src The first tensor for element-wise multiplication.
* @param acl_other The second tensor for element-wise multiplication.
* @param acl_dst The destination tensor where the result will be stored.
*/
void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_other, aclTensor* acl_dst = nullptr);
/**
* @brief Matrix division, optionally in-place.
*
* This function division each element of the source tensor `acl_src` by the
* tensor `acl_other` and stores the result in the destination tensor `acl_dst`.
* If `inplace` is true, `acl_dst` will not be used and the operation is
* performed in-place on `acl_src`. The operation is defined as: \f[
* \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
* \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_src Numerator tensor..
* @param acl_other Denominator tensor.
* @param acl_dst The destination tensor where the result will be stored if
* `inplace` is false.
* @param inplace Flag indicating whether to perform the operation in-place on
* `acl_src`.
*/
void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_other, aclTensor* acl_dst = nullptr);
/**
* @brief Applies element-wise cosine function to the elements of a tensor.
*
* This function computes the cosine of each element in the source tensor
* `acl_src` and stores the result in the destination tensor `acl_dst`. The
* operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src
* }_i\right) \f]
*
* @param ctx The context for the CANN backend operations.
* @param acl_src The source tensor on which the cosine function will be
* applied.
* @param acl_dst The destination tensor where the cosine results will be
* stored.
*/
void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_dst);
/**
* @brief Applies element-wise sine function to the elements of a tensor.
*
* This function computes the sine of each element in the source tensor
`acl_src`
* and stores the result in the destination tensor `acl_dst`.
* The operation is defined as:
* \f[
* \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right)
* \f]
* @param ctx The context for the CANN backend operations.
* @param acl_src The source tensor on which the sine function will be applied.
* @param acl_dst The destination tensor where the sine results will be stored.
*/
void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_dst);
/**
* @brief Prepares broadcast-compatible ACL tensors for two input tensors and one
* output tensor.
*
* This function checks whether broadcasting is needed between `src0` and `src1`.
* If broadcasting is required, it calculates the proper shapes and creates
* ACL tensors with broadcast parameters. Otherwise, it directly creates ACL tensors
* based on the original tensor shapes.
*
* @param src0 The first input tensor (reference shape).
* @param src1 The second input tensor (possibly broadcasted).
* @param dst The destination/output tensor.
* @param acl_src0 Output pointer to the created ACL tensor corresponding to src0.
* @param acl_src1 Output pointer to the created ACL tensor corresponding to src1.
* @param acl_dst Output pointer to the created ACL tensor corresponding to dst.
*/
void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst,
aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst);
/**
* @brief Computes the 1D transposed convolution (deconvolution) of a ggml
* tensor using the CANN backend.
*
* @details This function performs a 1D transposed convolution (also known as
* deconvolution) operation on the input tensor. The computed result is stored
* in the destination tensor `dst`. The operation is optimized using the CANN
* backend for improved performance.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the transposed convolution result
* will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`.
*/
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor
* using the CANN backend.
*
* @details This function performs an element-wise ELU activation on the input
* tensor.
* The result is written to the destination tensor `dst` in-place.
* The ELU function is defined as:
*
* \text{ELU}(x) =
* \begin{cases}
* x, & \text{if } x > 0 \\
* \alpha \left( \exp(x) - 1 \right), & \text{if } x \leq 0
* \end{cases}
*
* where α (alpha) is a hyperparameter, typically set to 1.0.
* This operation is optimized using the CANN backend for high-performance
* inference or training.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the ELU-activated result will be stored.
* dst->op is expected to be `GGML_OP_ELU`.
*/
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies a element-wise operation to two input tensors using the CANN
* backend.
*
* This templated function takes a binary operator and applies it to two source
* tensors
* associated with the destination tensor. The function handles broadcasting as
* needed.
*
* @tparam binary_op A callable object (e.g., lambda or function pointer) representing
* the binary operation to be performed. It must take three arguments:
* (ggml_backend_cann_context&, aclTensor*, aclTensor*, aclTensor*).
*
* @param ctx The CANN backend context used to manage execution and resources.
* @param dst The destination tensor.
*/
template <auto binary_op>
void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
aclTensor*, uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src0 = dst->src[0];
ggml_tensor* src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
aclTensor* acl_src0;
aclTensor* acl_src1;
aclTensor* acl_dst;
// Need bcast
bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst);
binary_op(ctx, acl_src0, acl_src1, acl_dst);
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
BCAST_SHAPE(src0, src1)
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
} else {
acl_src0 = ggml_cann_create_tensor(src0);
acl_src1 = ggml_cann_create_tensor(src1);
acl_dst = ggml_cann_create_tensor(dst);
}
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
&executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src0));
ACL_CHECK(aclDestroyTensor(acl_src1));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
/**
* @brief Launches an asynchronous task using the memory allocator.
*
* This macro submit an asynchronous task on the specified stream.
* The task uses memory allocated by the allocator. It is guaranteed
* that the memory will not be accessed by other tasks until this task
* completes, due to the sequential execution order within the same stream.
*
* @param OP_NAME aclnn operator name.
* @param args Additional arguments required by the task.
*
* @note
* Memory from the allocator will be "freed" immediately and can be
* reallocated to other pointers. However, it won't be accessed by any
* other task before this asynchronous task ends, because all tasks in the
* same stream are executed in queue order.
*/
#define GGML_CANN_CALL_ACLNN_OP(OP_NAME, ...) \
do { \
uint64_t workspaceSize = 0; \
aclOpExecutor * executor; \
void * workspaceAddr = nullptr; \
\
ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \
\
if (workspaceSize > 0) { \
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); \
workspaceAddr = workspace_allocator.get(); \
} \
ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, ctx.stream())); \
} while (0)
/**
* @brief Applies a unary operation to an input tensor using the CANN backend.
*
* This templated function applies a unary operator to the source tensor of `dst`
* and stores the result in the destination tensor.
*
* @tparam unary_op A callable with the signature:
* void(ggml_backend_cann_context&, aclTensor*, aclTensor*)
* where the first aclTensor is the source and the second is the destination.
* @param ctx The CANN backend context for managing resources and execution.
* @param dst The destination tensor. Its src[0] is treated as the input tensor.
*/
template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// Activation functions template.
template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
const aclrtStream)>
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
unary_op(ctx, acl_src, acl_dst);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
/**
* @brief Applies a unary operation to a ggml tensor using the CANN backend.
*
* @details This function performs a unary operation on the input tensor using
* a user-provided lambda or callable object `unary_op`, which accepts the CANN
* context and two ACL tensors (source and destination). Internally, this function
* creates ACL representations of the ggml tensors and invokes the unary operation.
* The result is stored in the destination tensor `dst`. This utility abstracts the
* common boilerplate of tensor conversion and cleanup when implementing unary ops.
*
* @param unary_op A callable that performs the unary operation using CANN APIs.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the result will be stored.
* The source tensor is retrieved from `dst->src[0]`.
*/
void ggml_cann_unary_op(
std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
ggml_backend_cann_context& ctx, ggml_tensor* dst);
// Activation functions template for const aclTensors.
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
const aclrtStream)>
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
/**
* @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
*
* This macro defines an inline lambda wrapping a specific ACL operation name,
* and passes it to the templated ggml_cann_unary_op function. It simplifies
* calling unary ops by hiding the lambda boilerplate.
*
* Internally, the lambda will call:
* @code
* GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst);
* @endcode
*
* @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
*
* @see ggml_cann_unary_op
* @see GGML_CANN_CALL_ACLNN_OP
*/
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
do { \
auto lambda = [](ggml_backend_cann_context& ctx, \
aclTensor* acl_src, \
aclTensor* acl_dst) { \
GGML_CANN_CALL_ACLNN_OP(OP_NAME, acl_src, acl_dst); \
}; \
ggml_cann_unary_op(lambda, ctx, dst); \
} \
while (0)
#endif // CANN_ACLNN_OPS

View File

@@ -803,7 +803,7 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
return GGML_STATUS_SUCCESS;
}
// TODO: cann backend doesn't support quantized yet. Just leave the code
// TODO: can backend doesn't support quantized yet. Just leave the code
// here.
if (ggml_is_quantized(tensor->type)) {
// Initialize padding to 0 to avoid possible NaN values
@@ -1300,63 +1300,47 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
ggml_cann_dup(ctx, dst);
break;
case GGML_OP_ADD:
case GGML_OP_ADD1:
ggml_cann_binary_op<aclnn_add>(ctx, dst);
break;
case GGML_OP_SUB:
ggml_cann_binary_op<aclnn_sub>(ctx, dst);
ggml_cann_add(ctx, dst);
break;
case GGML_OP_ACC:
ggml_cann_acc(ctx, dst);
break;
case GGML_OP_MUL:
ggml_cann_binary_op<aclnn_mul>(ctx, dst);
ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
break;
case GGML_OP_DIV:
ggml_cann_binary_op<aclnn_div>(ctx, dst);
ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_ABS:
GGML_CANN_CALL_UNARY_OP(Abs);
break;
case GGML_UNARY_OP_NEG:
GGML_CANN_CALL_UNARY_OP(Neg);
break;
case GGML_UNARY_OP_GELU:
GGML_CANN_CALL_UNARY_OP(Gelu);
ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
ctx, dst);
break;
case GGML_UNARY_OP_SILU:
GGML_CANN_CALL_UNARY_OP(Silu);
ggml_cann_activation<aclnnSiluGetWorkspaceSize, aclnnSilu>(
ctx, dst);
break;
// TODO: Use faster gelu??
case GGML_UNARY_OP_GELU_QUICK:
ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(
ctx, dst);
break;
case GGML_UNARY_OP_GELU_QUICK: {
auto lambda = [](ggml_backend_cann_context& ctx,
aclTensor* acl_src,
aclTensor* acl_dst) {
GGML_CANN_CALL_ACLNN_OP(GeluV2, acl_src, 0, acl_dst);
};
ggml_cann_unary_op(lambda, ctx, dst);
} break;
case GGML_UNARY_OP_TANH:
GGML_CANN_CALL_UNARY_OP(Tanh);
ggml_cann_activation<aclnnTanhGetWorkspaceSize, aclnnTanh>(
ctx, dst);
break;
case GGML_UNARY_OP_RELU:
GGML_CANN_CALL_UNARY_OP(Relu);
break;
case GGML_UNARY_OP_SIGMOID:
GGML_CANN_CALL_UNARY_OP(Sigmoid);
ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(
ctx, dst);
break;
case GGML_UNARY_OP_HARDSIGMOID:
GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
ggml_cann_activation<aclnnHardsigmoidGetWorkspaceSize,
aclnnHardsigmoid>(ctx, dst);
break;
case GGML_UNARY_OP_HARDSWISH:
GGML_CANN_CALL_UNARY_OP(Hardswish);
break;
case GGML_UNARY_OP_EXP:
GGML_CANN_CALL_UNARY_OP(Exp);
break;
case GGML_UNARY_OP_ELU:
ggml_cann_elu(ctx, dst);
ggml_cann_activation<aclnnHardswishGetWorkspaceSize,
aclnnHardswish>(ctx, dst);
break;
default:
return false;
@@ -1398,12 +1382,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
ggml_cann_scale(ctx, dst);
break;
case GGML_OP_SQR:
GGML_ASSERT(dst->src[1] == nullptr);
dst->src[1] = dst->src[0];
ggml_cann_binary_op<aclnn_mul>(ctx, dst);
break;
case GGML_OP_SQRT:
GGML_CANN_CALL_UNARY_OP(Sqrt);
ggml_cann_sqr(ctx, dst);
break;
case GGML_OP_CLAMP:
ggml_cann_clamp(ctx, dst);
@@ -1435,27 +1414,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
case GGML_OP_POOL_2D:
ggml_cann_pool2d(ctx, dst);
break;
case GGML_OP_SUM:
ggml_cann_sum(ctx, dst);
break;
case GGML_OP_SUM_ROWS:
ggml_cann_sum_rows(ctx, dst);
break;
case GGML_OP_ARGSORT:
ggml_cann_argsort(ctx, dst);
break;
case GGML_OP_ARGMAX:
ggml_cann_argmax(ctx, dst);
break;
case GGML_OP_COS:
ggml_cann_unary_op<aclnn_cos>(ctx, dst);
break;
case GGML_OP_SIN:
ggml_cann_unary_op<aclnn_sin>(ctx, dst);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_cann_conv_transpose_1d(ctx, dst);
break;
default:
return false;
}
@@ -1494,6 +1458,11 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
ACL_CHECK(aclrtSynchronizeDevice());
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
// finalize when last backend freed.
if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
ACL_CHECK(aclFinalize());
}
delete cann_ctx;
delete backend;
}
@@ -1706,32 +1675,24 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_SIGMOID:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_ELU:
return true;
default:
return false;
}
case GGML_OP_MUL_MAT: {
switch (op->src[0]->type) {
case GGML_TYPE_Q8_0:
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return true;
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
// only support contiguous for quantized types.
return ggml_is_contiguous(op->src[0]) &&
ggml_is_contiguous(op->src[1]);
return true;
default:
return false;
}
@@ -1743,6 +1704,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
return true;
default:
@@ -1750,21 +1712,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
}
} break;
case GGML_OP_CPY: {
ggml_tensor *src = op->src[0];
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
(src->type != GGML_TYPE_F32 &&
src->type != GGML_TYPE_F16)) {
// only support F32 and F16.
return false;
switch (op->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
return true;
default:
return false;
}
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
// unsupport dst is not contiguous.
return false;
}
return true;
} break;
}
case GGML_OP_CONT: {
// TODO: support GGML_TYPE_BF16
switch (op->src[0]->type) {
@@ -1777,14 +1734,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
}
case GGML_OP_ROPE: {
// TODO: with ops-test v == 1
float ext_factor = 0.0f;
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
// TODO: n_dims <= ne0
if (op->src[0]->ne[0] != op->op_params[1]) {
return false;
}
// TODO: ext_factor != 0
if (ext_factor != 0) {
if (*ext_factor != 0) {
return false;
}
@@ -1806,20 +1762,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
}
return true;
}
case GGML_OP_POOL_2D: {
const int32_t * opts = (const int32_t *) op->op_params;
const int k0 = opts[1];
const int k1 = opts[2];
const int p0 = opts[5];
const int p1 = opts[6];
// value of paddingH should be at most half of kernelH
// value of paddingW should be at most half of kernelW
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
}
case GGML_OP_SUM:
case GGML_OP_DUP:
case GGML_OP_IM2COL:
case GGML_OP_CONCAT:
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
@@ -1828,17 +1773,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_CLAMP:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
@@ -1847,10 +1790,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
case GGML_OP_ARGMAX:
case GGML_OP_COS:
case GGML_OP_SIN:
case GGML_OP_CONV_TRANSPOSE_1D:
return true;
default:
return false;

View File

@@ -0,0 +1,30 @@
file(GLOB SRC_FILES
get_row_f32.cpp
get_row_f16.cpp
get_row_q4_0.cpp
get_row_q8_0.cpp
quantize_f32_q8_0.cpp
quantize_f16_q8_0.cpp
quantize_float_to_q4_0.cpp
dup.cpp
)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

View File

@@ -0,0 +1,19 @@
#ifndef ASCENDC_KERNELS_H
#define ASCENDC_KERNELS_H
#include "aclrtlaunch_ascendc_get_row_f32.h"
#include "aclrtlaunch_ascendc_get_row_f16.h"
#include "aclrtlaunch_ascendc_get_row_q8_0.h"
#include "aclrtlaunch_ascendc_get_row_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
#endif // ASCENDC_KERNELS_H

View File

@@ -0,0 +1,234 @@
#include "kernel_operator.h"
using namespace AscendC;
#define BUFFER_NUM 2
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
template <typename SRC_T, typename DST_T>
class DupByRows {
public:
__aicore__ inline DupByRows() {}
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
size_t *input_nb_ub) {
/* Dup by rows when src is contigous on first dimension and dst is
contiguous, each kernel process one row.
*/
// Input has four dims.
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
// param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
num_elem = input_ne_ub[0];
// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
/ (input_ne_ub[1]);
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
- idx_ne2 * input_ne_ub[1];
// src may not contiguous in dim [1,2,3], so stride decited by ne&nb
src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
+ input_nb_ub[1] * idx_ne1;
// dst is contiguous
dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
src_stride));
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
dst_stride));
pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
32 - 1) / 32 * 32);
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
32 - 1) / 32 * 32);
}
__aicore__ inline void copy_in() {
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
const size_t elem_per_block = 32 / sizeof(SRC_T);
size_t tail = num_elem % elem_per_block;
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
DataCopy(src_local, src_gm, cpy_elements_len);
src_queue.EnQue(src_local);
}
__aicore__ inline void copy_out() {
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
#ifdef ASCEND_310P
const size_t elem_per_block = 32 / sizeof(DST_T);
size_t tail = num_elem % elem_per_block;
size_t len = num_elem & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(dst_gm, dst_local, len);
}
if(tail != 0) {
for (size_t i = tail; i < elem_per_block; i++) {
dst_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
SetAtomicNone();
}
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
DataCopyPad(dst_gm, dst_local, dataCopyParams);
#endif
dst_queue.FreeTensor(dst_local);
}
__aicore__ inline void dup() {
// main process, copy one row data from src to dst.
copy_in();
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
int32_t BLOCK_NUM = 32 / sizeof(DST_T);
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
/ BLOCK_NUM * BLOCK_NUM);
dst_queue.EnQue<DST_T>(dst_local);
src_queue.FreeTensor(src_local);
copy_out();
}
__aicore__ inline void dup_with_cast() {
// main process, copy one row data from src to dst.
// cast dtype from src to dst.
copy_in();
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
dst_queue.EnQue<DST_T>(dst_local);
src_queue.FreeTensor(src_local);
copy_out();
}
private:
TPipe pipe;
GlobalTensor<SRC_T> src_gm;
GlobalTensor<DST_T> dst_gm;
int64_t num_rows;
int64_t num_elem;
int64_t idx_ne3;
int64_t idx_ne2;
int64_t idx_ne1;
int64_t src_stride;
int64_t dst_stride;
TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<half, half> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<float, float> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup();
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<float, half> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
GM_ADDR src_gm,
GM_ADDR dst_gm,
GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm,
GM_ADDR output_ne_gm,
GM_ADDR output_nb_gm) {
// copy params from gm to ub.
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
DupByRows<half, float> op;
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
op.dup_with_cast();
}

View File

@@ -0,0 +1,197 @@
#include "kernel_operator.h"
// optimize me. Use template to avoid copy code.
using namespace AscendC;
#define BUFFER_NUM 2
class GET_ROW_F16 {
public:
__aicore__ inline GET_ROW_F16() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
// TODO, use template for F16/f32
int64_t op_block_num = GetBlockNum();
op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// Indices has two dims. n_elements = all rows should get.
// dr, all rows should this thread get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ half *)input);
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
& ~31);
uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
& ~31);
local_buffer_elems = input_local_buffer_size / sizeof(half);
// TODO, consider long row that can't put in UB.
// All data should asign to 32. It's ok because all data is align to 32.
pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
}
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
size_t origin_len = len;
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
const size_t elem_per_block = 32 / sizeof(half);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
len += elem_per_block;
}
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_row(int64_t idx) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3];
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3];
copy_in(input_offset, input_ne[0]);
LocalTensor<half> input_local = input_queue.DeQue<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
Cast(output_local, input_local, RoundMode::CAST_NONE,
local_buffer_elems);
output_queue.EnQue(output_local);
copy_out(output_offset, input_ne[0]);
input_queue.FreeTensor(input_local);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
calculate_row(i);
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
size_t local_buffer_elems;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<half> input_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_f16(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_F16 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

View File

@@ -0,0 +1,190 @@
#include "kernel_operator.h"
// optimize me. Use template to avoid copy code.
using namespace AscendC;
#define BUFFER_NUM 2
class GET_ROW_F32 {
public:
__aicore__ inline GET_ROW_F32() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, size_t *input_nb_ub,
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// Indices has two dims. n_elements = all rows should get.
// dr, all rows should this thread get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ float *)input);
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
local_buffer_elems = local_buffer_size / sizeof(float);
// TODO, consider long row that can't put in UB.
// All data should asign to 32. It's ok because all data is align to 32.
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
}
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
len += elem_per_block;
}
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_row(int64_t idx) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3];
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3];
copy_in(input_offset, input_ne[0]);
LocalTensor<float> input_local = input_queue.DeQue<float>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
DataCopy(output_local, input_local, local_buffer_elems);
output_queue.EnQue(output_local);
copy_out(output_offset, input_ne[0]);
input_queue.FreeTensor(input_local);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
calculate_row(i);
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
size_t local_buffer_elems;
int64_t ir;
int64_t dr;
TPipe pipe;
GlobalTensor<float> input_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_f32(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
size_t input_nb_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(input_nb_gm, input_nb_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_F32 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

View File

@@ -0,0 +1,204 @@
#include "kernel_operator.h"
// optimize me. Use template to avoid copy code.
using namespace AscendC;
#ifdef ASCEND_310P // 310P not support 4bit get row
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support 4bit get row.\n");
}
#else
#define BUFFER_NUM 2
#define QK4_0 32
class GET_ROW_Q4_0 {
public:
__aicore__ inline GET_ROW_Q4_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, int64_t *indices_ne_ub,
size_t *indices_nb_ub, int64_t *output_ne_ub,
size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
scale_ne[i] = input_ne_ub[i];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// one scale for a group.
scale_ne[0] /= QK4_0;
input_stride[0] = 1;
scale_stride[0] = 1;
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
group_size_in_row = input_ne[0] / QK4_0;
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
input_ne[3] / 2;
// Indices has two dims. n_elements = all rows should get.
// dr, all rows should this thread get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
// 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
DataCopy(input_local, input_gm[offset], QK4_0);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
DataCopy(output_gm[offset], output_local, QK4_0);
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3] +
group * QK4_0;
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
indices_ne1_idx * scale_stride[2] +
indices_ne2_idx * scale_stride[3] + group;
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3] +
group * QK4_0;
copy_in(input_offset);
LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up.
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
// Only mul need compile by group.
half scale = scale_gm.GetValue(scale_offset);
Muls(output_local, output_local, (float)scale, QK4_0);
input_queue.FreeTensor(input_local);
cast_queue.FreeTensor(cast_local);
output_queue.EnQue(output_local);
copy_out(output_offset);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
calculate_group(i, j);
}
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t scale_ne[4];
size_t scale_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t ir;
int64_t dr;
int64_t group_size_in_row;
TPipe pipe;
GlobalTensor<int4b_t> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_Q4_0 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
#endif // #ifdef ASCEND_310P

View File

@@ -0,0 +1,191 @@
#include "kernel_operator.h"
// optimize me. Use template to avoid copy code.
using namespace AscendC;
#define BUFFER_NUM 2
#define QK8_0 32
class GET_ROW_Q8_0 {
public:
__aicore__ inline GET_ROW_Q8_0() {}
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
int64_t *input_ne_ub, int64_t *indices_ne_ub,
size_t *indices_nb_ub, int64_t *output_ne_ub,
size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
indices_ne[i] = indices_ne_ub[i];
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
scale_ne[i] = input_ne_ub[i];
output_ne[i] = output_ne_ub[i];
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
}
// one scale for a group.
scale_ne[0] /= QK8_0;
input_stride[0] = 1;
scale_stride[0] = 1;
output_stride[0] = 1;
for (int i = 1; i < 4; i++) {
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
}
group_size_in_row = input_ne[0] / QK8_0;
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
input_ne[3] * sizeof(int8_t);
// Indices has two dims. n_elements = all rows should get.
// dr, all rows should this thread get.
uint64_t n_elements =
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
dr = n_elements / op_block_num;
uint64_t tails = n_elements % op_block_num;
if (op_block_idx < tails) {
dr += 1;
ir = dr * op_block_idx;
} else {
ir = dr * op_block_idx + tails;
}
input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
output_gm.SetGlobalBuffer((__gm__ float *)output);
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
}
__aicore__ inline void copy_in(uint32_t offset) {
LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
DataCopy(input_local, input_gm[offset], QK8_0);
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
DataCopy(output_gm[offset], output_local, QK8_0);
output_queue.FreeTensor(output_local);
}
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
const int64_t indices_ne1_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
indices_ne[0];
const int64_t indices_ne0_idx =
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
indices_ne1_idx * indices_ne[0]);
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
indices_ne1_idx * indices_stride[1] +
indices_ne2_idx * indices_stride[2];
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
const int64_t input_offset = selected_row_idx * input_stride[1] +
indices_ne1_idx * input_stride[2] +
indices_ne2_idx * input_stride[3] +
group * QK8_0;
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
indices_ne1_idx * scale_stride[2] +
indices_ne2_idx * scale_stride[3] + group;
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
indices_ne1_idx * output_stride[2] +
indices_ne2_idx * output_stride[3] +
group * QK8_0;
copy_in(input_offset);
LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
// TODO: cast more data to speed up.
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
// Only mul need compile by group.
half scale = scale_gm.GetValue(scale_offset);
Muls(output_local, output_local, (float)scale, QK8_0);
input_queue.FreeTensor(input_local);
cast_queue.FreeTensor(cast_local);
output_queue.EnQue(output_local);
copy_out(output_offset);
}
__aicore__ inline void calculate() {
for (int64_t i = ir; i < ir + dr; i++) {
for (int64_t j = 0; j < group_size_in_row; j++) {
calculate_group(i, j);
}
}
}
private:
int64_t input_ne[4];
size_t input_stride[4];
int64_t scale_ne[4];
size_t scale_stride[4];
int64_t indices_ne[4];
size_t indices_stride[4];
int64_t output_ne[4];
size_t output_stride[4];
int64_t ir;
int64_t dr;
int64_t group_size_in_row;
TPipe pipe;
GlobalTensor<int8_t> input_gm;
GlobalTensor<half> scale_gm;
GlobalTensor<int32_t> indices_gm;
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
};
template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
auto gm_ptr = (__gm__ uint8_t *)gm;
auto ub_ptr = (uint8_t *)(ub);
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
*ub_ptr = *gm_ptr;
}
}
extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
int64_t input_ne_ub[4];
int64_t indices_ne_ub[4];
size_t indices_nb_ub[4];
int64_t output_ne_ub[4];
size_t output_nb_ub[4];
copy_to_ub(input_ne_gm, input_ne_ub, 32);
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
copy_to_ub(output_ne_gm, output_ne_ub, 32);
copy_to_ub(output_nb_gm, output_nb_ub, 32);
GET_ROW_Q8_0 op;
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}

Some files were not shown because too many files have changed in this diff Show More