Compare commits

..

2 Commits

Author SHA1 Message Date
Georgi Gerganov
091d98e2c5 rpc : use std::unique_ptr for the message_queue 2026-01-06 15:32:01 +02:00
Radoslav Gerganov
df27d80ae3 rpc : implement event and async backend APIs 2026-01-05 16:33:15 +02:00
276 changed files with 15561 additions and 26822 deletions

View File

@@ -13,7 +13,7 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.
FROM ${CANN_BASE_IMAGE} AS build
# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git openssl-devel python3 python3-pip && \
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum

View File

@@ -5,7 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION AS build
ARG TARGETARCH
RUN apt-get update && \
apt-get install -y build-essential git cmake libssl-dev
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
WORKDIR /app

View File

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
WORKDIR /app

View File

@@ -6,7 +6,7 @@ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libssl-dev
apt-get install -y git libcurl4-openssl-dev
WORKDIR /app

View File

@@ -6,7 +6,7 @@ WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make openssl-devel
RUN yum install -y gcc g++ cmake make libcurl-devel
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}

View File

@@ -18,7 +18,7 @@ RUN apt-get update && \
python3 \
python3-pip \
git \
libssl-dev \
libcurl4-openssl-dev \
libgomp1
WORKDIR /app

View File

@@ -32,6 +32,7 @@
useMpi ? false,
useRocm ? config.rocmSupport,
rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
enableCurl ? true,
useVulkan ? false,
useRpc ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -159,13 +160,15 @@ effectiveStdenv.mkDerivation (finalAttrs: {
++ optionals useMpi [ mpi ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];
cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CUDA" useCuda)

View File

@@ -27,7 +27,7 @@ RUN apt-get update \
build-essential \
cmake \
git \
libssl-dev \
libcurl4-openssl-dev \
curl \
libgomp1

View File

@@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libssl-dev && \
libopenblas-dev libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app

View File

@@ -5,8 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget xz-utils
# Install SSL and Vulkan SDK dependencies
RUN apt install -y libssl-dev curl \
# Install cURL and Vulkan SDK dependencies
RUN apt install -y libcurl4-openssl-dev curl \
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc
# Build it
@@ -33,7 +33,6 @@ FROM ubuntu:$UBUNTU_VERSION AS base
RUN apt-get update \
&& apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
libglvnd0 libgl1 libglx0 libegl1 libgles2 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \

View File

@@ -0,0 +1,30 @@
# Composite GitHub Action: downloads a prebuilt curl/libcurl zip for Windows
# from curl.se, extracts it under the runner's temp directory, and publishes
# the extraction directory as the `curl_path` output.
name: 'Windows - Setup CURL'
description: 'Composite action, to be reused in other workflow'
inputs:
# Version string substituted into both the download directory and the archive name.
curl_version:
description: 'CURL version'
required: false
default: '8.6.0_6'
# Architecture token substituted into the archive name.
architecture:
description: 'Architecture of the libcurl to download'
required: false
default: 'win64'
outputs:
# Directory the archive was extracted into; forwarded from the `get_libcurl` step.
curl_path:
description: "Path to the downloaded libcurl"
value: ${{ steps.get_libcurl.outputs.curl_path }}
runs:
using: "composite"
steps:
- name: libCURL
id: get_libcurl
shell: powershell
# Inputs are passed through environment variables and read as $env:... in the script below.
env:
CURL_VERSION: ${{ inputs.curl_version }}
ARCHITECTURE: ${{ inputs.architecture }}
# Script steps: download the zip (following redirects), create the target
# directory, extract while dropping the archive's top-level directory, then
# write `curl_path` to GITHUB_OUTPUT.
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT

View File

@@ -20,7 +20,7 @@ jobs:
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release

View File

@@ -30,7 +30,7 @@ jobs:
# - name: Build
# run: |
# cmake -B build -DLLAMA_OPENSSL=OFF \
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
@@ -76,7 +76,7 @@ jobs:
# - name: Build
# run: |
# cmake -B build -DLLAMA_OPENSSL=OFF \
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
@@ -122,7 +122,7 @@ jobs:
# - name: Build
# run: |
# cmake -B build -DLLAMA_OPENSSL=OFF \
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
@@ -178,7 +178,7 @@ jobs:
- name: Build
run: |
cmake -B build -DLLAMA_OPENSSL=OFF \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -235,7 +235,7 @@ jobs:
- name: Build
run: |
cmake -B build -DLLAMA_OPENSSL=OFF \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
@@ -281,7 +281,7 @@ jobs:
- name: Build
run: |
export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
cmake -B build -DLLAMA_OPENSSL=OFF \
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \

View File

@@ -79,6 +79,7 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=OFF \
@@ -91,7 +92,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L 'main|curl' --verbose --timeout 900
macOS-latest-cmake-x64:
runs-on: macos-15-intel
@@ -117,6 +118,7 @@ jobs:
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
@@ -150,13 +152,13 @@ jobs:
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -225,6 +227,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -233,7 +237,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -289,6 +293,8 @@ jobs:
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -299,6 +305,8 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
@@ -328,10 +336,14 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build \
mkdir build
cd build
cmake .. \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_LLGUIDANCE=ON
cmake --build build --config Release -j $(nproc)
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
@@ -365,6 +377,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
@@ -398,6 +412,8 @@ jobs:
id: cmake_configure
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DGGML_BACKEND_DL=ON \
-DGGML_CPU_ALL_VARIANTS=ON \
@@ -454,6 +470,8 @@ jobs:
run: |
source ./vulkan_sdk/setup-env.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_VULKAN=ON
cmake --build build --config Release -j $(nproc)
@@ -514,19 +532,21 @@ jobs:
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
run: |
export Dawn_DIR=dawn/lib64/cmake/Dawn
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_WEBGPU=ON
cmake --build build --config Release -j $(nproc)
@@ -573,7 +593,7 @@ jobs:
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-DGGML_WEBGPU=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
cmake --build build-wasm --target test-backend-ops -j $(nproc)
@@ -604,6 +624,8 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DGGML_HIP=ON
@@ -635,6 +657,8 @@ jobs:
id: cmake_build
run: |
cmake -B build -S . \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
@@ -682,6 +706,8 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
@@ -731,6 +757,8 @@ jobs:
run: |
source /opt/intel/oneapi/setvars.sh
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
@@ -865,7 +893,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@@ -1015,7 +1043,7 @@ jobs:
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }} `
-DLLAMA_BUILD_BORINGSSL=ON
-DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add libopenblas.dll
@@ -1073,6 +1101,8 @@ jobs:
# TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
run: |
cmake -S . -B build -G Ninja \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
@@ -1120,6 +1150,7 @@ jobs:
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
cmake -S . -B build -G "Ninja Multi-Config" ^
-DLLAMA_BUILD_SERVER=ON ^
-DLLAMA_CURL=OFF ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
@@ -1227,6 +1258,7 @@ jobs:
-DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
-DCMAKE_BUILD_TYPE=Release `
-DLLAMA_CURL=OFF `
-DLLAMA_BUILD_BORINGSSL=ON `
-DROCM_DIR="${env:HIP_PATH}" `
-DGGML_HIP=ON `
@@ -1253,7 +1285,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@@ -1320,7 +1352,7 @@ jobs:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
@@ -1386,6 +1418,7 @@ jobs:
echo "FIXME: test on devices"
openEuler-latest-cmake-cann:
if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
defaults:
run:
shell: bash -el {0}
@@ -1431,7 +1464,7 @@ jobs:
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
@@ -1465,7 +1498,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
@@ -1491,7 +1524,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
@@ -1517,7 +1550,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
@@ -1543,7 +1576,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
@@ -1569,7 +1602,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
@@ -1672,34 +1705,6 @@ jobs:
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-webgpu:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dawn Dependency
id: dawn-depends
run: |
DAWN_VERSION="v2.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
curl -L -o artifact.zip \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
mkdir dawn
unzip artifact.zip
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
- name: Test
id: ggml-ci
run: |
GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
@@ -1733,7 +1738,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential
sudo apt-get install -y build-essential libcurl4-openssl-dev
- name: Test
id: ggml-ci
@@ -1800,6 +1805,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1817,7 +1824,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
@@ -1892,7 +1899,7 @@ jobs:
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=ON \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1911,7 +1918,7 @@ jobs:
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -1982,7 +1989,7 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -2056,6 +2063,8 @@ jobs:
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -2091,6 +2100,7 @@ jobs:
sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
apt-get install -y \
build-essential \
libcurl4-openssl-dev \
python3-venv \
gpg \
wget \

View File

@@ -38,7 +38,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev
sudo apt-get install build-essential libcurl4-openssl-dev
# Install git-clang-format script for formatting only changed code
wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format

View File

@@ -37,6 +37,13 @@ jobs:
key: macOS-latest-cmake-arm64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
@@ -45,7 +52,6 @@ jobs:
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DGGML_RPC=ON \
@@ -84,6 +90,13 @@ jobs:
key: macOS-latest-cmake-x64
evict-old-files: 1d
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
brew install curl
- name: Build
id: cmake_build
run: |
@@ -94,7 +107,6 @@ jobs:
-DCMAKE_INSTALL_RPATH='@loader_path' \
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_BUILD_BORINGSSL=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
@@ -147,7 +159,7 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libssl-dev
sudo apt-get install build-essential libcurl4-openssl-dev
- name: Build
id: cmake_build
@@ -200,7 +212,7 @@ jobs:
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev
- name: Build
id: cmake_build
@@ -257,23 +269,34 @@ jobs:
run: |
choco install ninja
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
shell: cmd
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
cmake -S . -B build -G "Ninja Multi-Config" ^
-D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DGGML_NATIVE=OFF ^
-DGGML_BACKEND_DL=ON ^
-DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^
-DGGML_OPENMP=ON ^
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^
${{ env.CMAKE_ARGS }}
cmake --build build --config Release
- name: Pack artifacts
id: pack_artifacts
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
@@ -351,7 +374,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_BUILD_BORINGSSL=ON
cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF
cmake --build build --config Release --target ${{ matrix.target }}
- name: Pack artifacts
@@ -405,7 +428,7 @@ jobs:
-DGGML_NATIVE=OFF ^
-DGGML_CPU=OFF ^
-DGGML_CUDA=ON ^
-DLLAMA_BUILD_BORINGSSL=ON ^
-DLLAMA_CURL=OFF ^
-DGGML_CUDA_CUB_3DOT2=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda
@@ -474,7 +497,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^
-DGGML_CPU=OFF -DGGML_SYCL=ON ^
-DLLAMA_BUILD_BORINGSSL=ON
-DLLAMA_CURL=OFF
cmake --build build --target ggml-sycl -j
- name: Build the release package
@@ -601,7 +624,7 @@ jobs:
-DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" `
-DGGML_HIP_ROCWMMA_FATTN=ON `
-DGGML_HIP=ON `
-DLLAMA_BUILD_BORINGSSL=ON
-DLLAMA_CURL=OFF
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
@@ -642,7 +665,7 @@ jobs:
cmake -B build -G Xcode \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF \
-DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_TESTS=OFF \
@@ -721,7 +744,7 @@ jobs:
"${{ steps.cann-image.outputs.image }}" \
bash -lc '
set -e
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake libcurl-devel
yum clean all && rm -rf /var/cache/yum
git config --global --add safe.directory "/workspace"
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}

View File

@@ -168,6 +168,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@@ -180,6 +182,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@@ -191,6 +195,8 @@ jobs:
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

View File

@@ -72,7 +72,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup
@@ -108,7 +108,7 @@ jobs:
- name: Build
id: cmake_build
run: |
cmake -B build -DLLAMA_BUILD_BORINGSSL=ON
cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
- name: Python setup

1
.gitignore vendored
View File

@@ -130,7 +130,6 @@ poetry.toml
# Local scripts
/run-vim.sh
/run-chat.sh
/run-spec.sh
/.ccache/
# IDE

View File

@@ -111,16 +111,11 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_HTTPLIB "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# deprecated
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
if (LLAMA_CURL)
message(WARNING "LLAMA_CURL option is deprecated and will be ignored")
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
@@ -187,9 +182,6 @@ if (NOT MSVC)
endif()
endif()
include("cmake/license.cmake")
license_add_file("llama.cpp" "LICENSE")
#
# 3rd-party
#
@@ -217,6 +209,11 @@ add_subdirectory(src)
# utils, programs, examples and tests
#
if (NOT LLAMA_BUILD_COMMON)
message(STATUS "LLAMA_BUILD_COMMON is OFF, disabling LLAMA_CURL")
set(LLAMA_CURL OFF)
endif()
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
if (LLAMA_HTTPLIB)
@@ -238,19 +235,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
add_subdirectory(tools)
endif()
# Automatically add all files from the 'licenses' directory
file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")
foreach(FILE_PATH ${EXTRA_LICENSES})
get_filename_component(FILE_NAME "${FILE_PATH}" NAME)
string(REGEX REPLACE "^LICENSE-" "" NAME "${FILE_NAME}")
license_add_file("${NAME}" "${FILE_PATH}")
endforeach()
if (LLAMA_BUILD_COMMON)
license_generate(common)
endif()
#
# install
#

View File

@@ -20,7 +20,7 @@ If AI is used to generate any portion of the code, contributors must adhere to t
1. Explicitly disclose the manner in which AI was employed.
2. Perform a comprehensive manual review prior to submitting the pull request.
3. Be prepared to explain every line of code they submitted when asked about it by a maintainer.
4. Using AI to write pull request descriptions or to respond to human reviewers is strictly prohibited.
4. Using AI to respond to human reviewers is strictly prohibited.
For more info, please refer to the [AGENTS.md](AGENTS.md) file.

View File

@@ -200,7 +200,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [BonzAI App](https://apps.apple.com/us/app/bonzai-your-local-ai-agent/id6752847988) (proprietary)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
@@ -483,6 +482,21 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
## [`llama-run`](tools/run)
#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [RamaLama](https://github.com/containers/ramalama)
## [`llama-simple`](examples/simple)
#### A minimal example for implementing apps with `llama.cpp`. Useful for developers.
@@ -586,5 +600,7 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain

View File

@@ -1,52 +1,12 @@
# Security Policy
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
- [**Requirements**](#requirements)
- [**Covered Topics**](#covered-topics)
- [**Using llama.cpp securely**](#using-llamacpp-securely)
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Untrusted environments or networks](#untrusted-environments-or-networks)
- [Multi-Tenant environments](#multi-tenant-environments)
## Reporting a vulnerability
If you have discovered a security vulnerability in this project that falls inside the [covered topics](#covered-topics), please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080
## Requirements
Before submitting your report, ensure you meet the following requirements:
- You have read this policy and fully understand it.
- AI is only permitted in an assistive capacity as stated in [AGENTS.md](AGENTS.md). We do not accept reports that are written exclusively by AI.
- Your report must include a working Proof-of-Concept in the form of a script and/or attached files.
Maintainers reserve the right to close the report if these requirements are not fulfilled.
## Covered Topics
Only vulnerabilities that fall within these parts of the project are considered valid. For problems falling outside of this list, please report them as issues.
- `src/**/*`
- `ggml/**/*`
- `gguf-py/**/*`
- `tools/server/*`, **excluding** the following topics:
- Web UI
- Features marked as experimental
- Features not recommended for use in untrusted environments (e.g., router, MCP)
- Bugs that can lead to Denial-of-Service attack
Note that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities in LLaMA C++.
For vulnerabilities that fall within the `vendor` directory, please report them directly to the third-party project.
- [**Reporting a vulnerability**](#reporting-a-vulnerability)
## Using llama.cpp securely
@@ -95,3 +55,19 @@ If you intend to run multiple models in parallel with shared memory, it is your
3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
## Reporting a vulnerability
Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
<!-- normal version -->
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
> [!IMPORTANT]
> For collaborators: if you are interested in helping out with reviewing private security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080

View File

@@ -414,7 +414,7 @@ cmake -B build-ios-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-ios-sim --config Release -- -quiet
@@ -428,7 +428,7 @@ cmake -B build-ios-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-ios-device --config Release -- -quiet
@@ -439,7 +439,7 @@ cmake -B build-macos -G Xcode \
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-macos --config Release -- -quiet
@@ -453,7 +453,7 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
@@ -469,7 +469,7 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
@@ -487,7 +487,7 @@ cmake -B build-tvos-sim -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-tvos-sim --config Release -- -quiet
@@ -502,7 +502,7 @@ cmake -B build-tvos-device -G Xcode \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-S .
cmake --build build-tvos-device --config Release -- -quiet

View File

@@ -45,7 +45,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -105,20 +105,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
if [ -z "${CMAKE_PREFIX_PATH}" ]; then
export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}"
else
export CMAKE_PREFIX_PATH="${GG_BUILD_WEBGPU_DAWN_PREFIX}:${CMAKE_PREFIX_PATH}"
fi
fi
# For some systems, Dawn_DIR needs to be set explicitly, e.g., the lib64 path
if [ ! -z "${GG_BUILD_WEBGPU_DAWN_DIR}" ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DDawn_DIR=${GG_BUILD_WEBGPU_DAWN_DIR}"
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
fi
if [ ! -z ${GG_BUILD_MUSA} ]; then
@@ -297,8 +284,7 @@ function gg_sum_test_scripts {
}
function gg_get_model {
#local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-q4_0.gguf"
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
else

View File

@@ -1,21 +0,0 @@
# Fetch one file from the ggml-org/models Hugging Face repository into DEST and
# verify it against HASH.
# Inputs (none are set in this script, so the invoker must define them):
#   NAME - path of the file inside the ggml-org/models repository
#   DEST - local path the file is written to
#   HASH - value forwarded to EXPECTED_HASH (CMake expects the ALGO=value form)
# make sure the directory that will hold DEST exists
get_filename_component(DEST_DIR "${DEST}" DIRECTORY)
file(MAKE_DIRECTORY "${DEST_DIR}")
# only announce the download when DEST is not on disk yet
if(NOT EXISTS "${DEST}")
message(STATUS "Downloading ${NAME} from ggml-org/models...")
endif()
# with EXPECTED_HASH, file(DOWNLOAD) skips the transfer when DEST already exists and matches the hash
file(DOWNLOAD
"https://huggingface.co/ggml-org/models/resolve/main/${NAME}?download=true"
"${DEST}"
TLS_VERIFY ON
EXPECTED_HASH ${HASH}
STATUS status
)
# status is a two-element list: numeric code (0 means success) followed by a message string
list(GET status 0 code)
if(NOT code EQUAL 0)
list(GET status 1 msg)
message(FATAL_ERROR "Failed to download ${NAME}: ${msg}")
endif()

View File

@@ -1,40 +0,0 @@
# Global accumulator for the embedded license text: license_add_file() appends
# to it and license_generate() reads it back.
define_property(GLOBAL PROPERTY LICENSE_TEXT
BRIEF_DOCS "Embedded licenses"
FULL_DOCS "Global string containing all aggregated licenses"
)
# license_add_file(<NAME> <FILE>)
#
# Appends the contents of FILE to the global LICENSE_TEXT property as one C++
# raw string literal (delimiter "=L="), preceded by a "License for <NAME>"
# heading underlined with '=' characters. A relative FILE is resolved against
# CMAKE_CURRENT_SOURCE_DIR. A missing file only produces a warning.
function(license_add_file NAME FILE)
    if(NOT IS_ABSOLUTE "${FILE}")
        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
    endif()

    # nothing to embed: warn and leave LICENSE_TEXT untouched
    if(NOT EXISTS "${FILE}")
        message(WARNING "License file '${FILE}' not found")
        return()
    endif()

    set(HEADING "License for ${NAME}")
    # one '=' per heading character gives an underline of matching width
    string(REGEX REPLACE "." "=" HEADING_RULE "${HEADING}")
    file(READ "${FILE}" LICENSE_BODY)

    get_property(ALL_LICENSES GLOBAL PROPERTY LICENSE_TEXT)
    string(APPEND ALL_LICENSES "R\"=L=(${HEADING}\n${HEADING_RULE}\n\n${LICENSE_BODY})=L=\",\n")
    set_property(GLOBAL PROPERTY LICENSE_TEXT "${ALL_LICENSES}")
endfunction()
# license_generate(<TARGET_NAME>)
#
# Writes ${CMAKE_BINARY_DIR}/license.cpp, which defines a nullptr-terminated
# `const char* LICENSES[]` array built from the global LICENSE_TEXT property,
# and adds that file as a private source of TARGET_NAME. Fails the configure
# step when TARGET_NAME is not an existing target (the file has already been
# written by then).
function(license_generate TARGET_NAME)
    message(STATUS "Generating embedded license file for target: ${TARGET_NAME}")

    get_property(ALL_LICENSES GLOBAL PROPERTY LICENSE_TEXT)

    # every entry in LICENSE_TEXT already ends with ",\n", so the nullptr
    # sentinel can follow it directly
    string(CONCAT GENERATED_SOURCE
        "// Generated by CMake\n\n"
        "const char* LICENSES[] = {\n"
        "${ALL_LICENSES}"
        "nullptr\n"
        "};\n"
    )

    set(GENERATED_PATH "${CMAKE_BINARY_DIR}/license.cpp")
    file(WRITE "${GENERATED_PATH}" "${GENERATED_SOURCE}")

    if(NOT TARGET ${TARGET_NAME})
        message(FATAL_ERROR "Target '${TARGET_NAME}' does not exist")
    else()
        target_sources(${TARGET_NAME} PRIVATE "${GENERATED_PATH}")
    endif()
endfunction()

View File

@@ -60,8 +60,6 @@ add_library(${TARGET} STATIC
common.h
console.cpp
console.h
debug.cpp
debug.h
download.cpp
download.h
http.h
@@ -97,7 +95,17 @@ endif()
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_HTTPLIB)
if (LLAMA_CURL)
# Use curl to download model url
find_package(CURL)
if (NOT CURL_FOUND)
message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
endif()
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
elseif (LLAMA_HTTPLIB)
# otherwise, use cpp-httplib
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
@@ -147,3 +155,27 @@ if (LLAMA_LLGUIDANCE)
endif ()
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
#
# copy the license files
#
# Check if running in GitHub Actions
if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
message(STATUS "Running inside GitHub Actions - copying license files")
# Copy all files from licenses/ to build/bin/
file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
foreach(LICENSE_FILE ${LICENSE_FILES})
get_filename_component(FILENAME ${LICENSE_FILE} NAME)
add_custom_command(
POST_BUILD
TARGET ${TARGET}
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${LICENSE_FILE}"
"$<TARGET_FILE_DIR:llama>/${FILENAME}"
COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
endforeach()
endif()

View File

@@ -2,11 +2,10 @@
#include "chat.h"
#include "common.h"
#include "download.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "preset.h"
#include "download.h"
// fix problem with std::min and std::max
#if defined(_WIN32)
@@ -48,8 +47,6 @@
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
extern const char * LICENSES[];
using json = nlohmann::ordered_json;
using namespace common_arg_utils;
@@ -271,55 +268,6 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
}
}
// Returns a copy of fname in which every backslash and forward slash has been
// replaced by an underscore; the input string is left untouched.
static std::string clean_file_name(const std::string & fname) {
    std::string sanitized(fname);
    // backslashes first, then forward slashes
    for (const char * separator : { "\\", "/" }) {
        string_replace_all(sanitized, separator, "_");
    }
    return sanitized;
}
// Downloads <endpoint><repo>/resolve/main/preset.ini for params.model.hf_repo
// into the cache and, when the download status is in [200, 400), applies the
// section named after the repo tag to params ("latest" maps to "default").
//
// Returns true when a preset file was obtained, false when none was found.
// Throws std::runtime_error when the file exists but lacks the requested section.
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
    GGML_ASSERT(!params.model.hf_repo.empty());

    // split "repo:tag"; the repo part comes back without the tag
    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);

    // the default tag ("latest") selects the "default" preset section
    if (hf_tag == "latest") {
        hf_tag = "default";
    }

    const bool offline = params.offline;
    std::string model_endpoint = get_model_endpoint();
    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

    // local cache location for the downloaded preset
    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
    auto preset_path  = fs_get_cache_file(preset_fname);

    const int  status     = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
    const bool has_preset = status >= 200 && status < 400;

    // the remote preset is optional, so a missing file is not an error
    if (!has_preset) {
        LOG_INF("%s", "no remote preset found, skipping\n");
        return has_preset;
    }

    LOG_INF("applying remote preset from %s\n", preset_url.c_str());

    common_preset_context ctx(ex, /* only_remote_allowed */ true);
    common_preset global;
    auto remote_presets = ctx.load_from_ini(preset_path, global);
    remote_presets = ctx.cascade(global, remote_presets);

    auto wanted = remote_presets.find(hf_tag);
    if (wanted == remote_presets.end()) {
        throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
    }

    // work on a copy, as the original did via .at()
    common_preset preset = wanted->second;
    LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
    preset.apply_to_params(params);

    return has_preset;
}
struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
@@ -341,7 +289,7 @@ static handle_model_result common_params_handle_model(
if (model.path.empty()) {
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // error message already printed
exit(1); // built without CURL, error message already printed
}
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag
@@ -361,7 +309,9 @@ static handle_model_result common_params_handle_model(
// make sure model path is present (for caching purposes)
if (model.path.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
std::string filename = model.hf_repo + "_" + model.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
model.path = fs_get_cache_file(filename);
}
@@ -475,87 +425,61 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
};
auto parse_cli_args = [&]() {
std::set<std::string> seen_args;
std::set<std::string> seen_args;
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
std::string arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
if (!seen_args.insert(arg).second) {
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
}
auto & tmp = arg_to_options[arg];
auto opt = *tmp.first;
bool is_positive = tmp.second;
if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
}
try {
if (opt.handler_void) {
opt.handler_void(params);
continue;
}
if (opt.handler_bool) {
opt.handler_bool(params, is_positive);
continue;
}
// arg with single value
check_arg(i);
std::string val = argv[++i];
if (opt.handler_int) {
opt.handler_int(params, std::stoi(val));
continue;
}
if (opt.handler_string) {
opt.handler_string(params, val);
continue;
}
// arg with 2 values
check_arg(i);
std::string val2 = argv[++i];
if (opt.handler_str_str) {
opt.handler_str_str(params, val, val2);
continue;
}
} catch (std::exception & e) {
throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h",
arg.c_str(), e.what(), opt.to_string().c_str()));
}
std::string arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
};
// parse the first time to get -hf option (used for remote preset)
parse_cli_args();
// maybe handle remote preset
if (!params.model.hf_repo.empty()) {
std::string cli_hf_repo = params.model.hf_repo;
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
// special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
// this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
std::string preset_hf_repo = params.model.hf_repo;
bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
if (has_preset) {
// re-parse CLI args to override preset values
parse_cli_args();
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
if (!seen_args.insert(arg).second) {
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
}
auto & tmp = arg_to_options[arg];
auto opt = *tmp.first;
bool is_positive = tmp.second;
if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
}
try {
if (opt.handler_void) {
opt.handler_void(params);
continue;
}
if (opt.handler_bool) {
opt.handler_bool(params, is_positive);
continue;
}
// preserve hf_repo from preset if needed
if (preset_has_hf_repo) {
params.model.hf_repo = preset_hf_repo;
// arg with single value
check_arg(i);
std::string val = argv[++i];
if (opt.handler_int) {
opt.handler_int(params, std::stoi(val));
continue;
}
if (opt.handler_string) {
opt.handler_string(params, val);
continue;
}
// arg with 2 values
check_arg(i);
std::string val2 = argv[++i];
if (opt.handler_str_str) {
opt.handler_str_str(params, val, val2);
continue;
}
} catch (std::exception & e) {
throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h",
arg.c_str(), e.what(), opt.to_string().c_str()));
}
}
@@ -755,6 +679,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-quantize",
"llama-qwen2vl-cli",
"llama-retrieval",
"llama-run",
"llama-save-load-state",
"llama-server",
"llama-simple",
@@ -929,54 +854,6 @@ bool common_arg_utils::is_autoy(const std::string & value) {
return value == "auto" || value == "-1";
}
// Minimal single-row CSV splitter with support for quoted fields and doubled quotes
// example:
//   input:  value1,"value, with, commas","value with ""escaped"" quotes",value4
//   output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
//
// A quote opens a quoted section only when the current field is still empty;
// anywhere else outside quotes it is kept as a literal character. The result
// always has at least one (possibly empty) field.
static std::vector<std::string> parse_csv_row(const std::string& input) {
    std::vector<std::string> result;
    std::string current;
    bool quoted = false;

    const size_t len = input.length();
    size_t pos = 0;
    while (pos < len) {
        const char c = input[pos++]; // pos now points at the following character

        if (quoted) {
            if (c != '"') {
                // inside quotes everything, commas included, is literal
                current += c;
            } else if (pos < len && input[pos] == '"') {
                // doubled quote collapses to a single literal quote
                current += '"';
                ++pos;
            } else {
                quoted = false; // closing quote
            }
            continue;
        }

        if (c == ',') {
            // field separator
            result.push_back(std::move(current));
            current.clear();
        } else if (c == '"' && current.empty()) {
            quoted = true; // opening quote at the start of a field
        } else {
            // ordinary character, or a quote in the middle of an unquoted field
            current += c;
        }
    }

    // whatever is left is the last field
    result.push_back(std::move(current));
    return result;
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// per-example default params
// we define here to make sure it's included in llama-gen-docs
@@ -1041,16 +918,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
exit(0);
}
));
add_opt(common_arg(
{"--license"},
"show source code license and dependencies",
[](common_params &) {
for (int i = 0; LICENSES[i]; ++i) {
printf("%s\n", LICENSES[i]);
}
exit(0);
}
));
add_opt(common_arg(
{"-cl", "--cache-list"},
"show list of models in cache",
@@ -1295,7 +1162,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--context-shift"},
{"--no-context-shift"},
@@ -1383,7 +1250,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--in-file"}, "FNAME",
"an input file (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
std::ifstream file(item);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1530,7 +1397,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, bool value) {
params.warmup = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@@ -1846,7 +1713,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
add_opt(common_arg(
{"--attention"}, "{causal,non-causal}",
"attention type for embeddings, use model default if unspecified",
@@ -2135,7 +2002,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
params.image.emplace_back(item);
}
}
@@ -2174,22 +2041,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"--mmap"},
{"--no-mmap"},
string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_mmap = value;
if (value) {
params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
}
}
).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.use_direct_io = value;
}
).set_env("LLAMA_ARG_DIO"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
@@ -2341,7 +2197,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument(
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
);
}
for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2381,28 +2237,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_FIT"));
add_opt(common_arg(
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
string_format("target margin per device for --fit, comma-separated list of values, "
"single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
[](common_params & params, const std::string & value) {
std::string arg_next = value;
// split string by , and /
const std::regex regex{ R"([,/]+)" };
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
std::vector<std::string> split_arg{ it, {} };
if (split_arg.size() >= llama_max_devices()) {
throw std::invalid_argument(
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
);
}
if (split_arg.size() == 1) {
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
return;
}
for (size_t i = 0; i < split_arg.size(); i++) {
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
}
{ "-fitt", "--fit-target" }, "MiB",
string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
[](common_params & params, int value) {
params.fit_params_target = value * size_t(1024*1024);
}
).set_env("LLAMA_ARG_FIT_TARGET"));
add_opt(common_arg(
@@ -2421,12 +2259,37 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
));
add_opt(common_arg(
{"--override-kv"}, "KEY=TYPE:VALUE,...",
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
std::vector<std::string> kv_overrides;
std::string current;
bool escaping = false;
for (const char c : value) {
if (escaping) {
current.push_back(c);
escaping = false;
} else if (c == '\\') {
escaping = true;
} else if (c == ',') {
kv_overrides.push_back(current);
current.clear();
} else {
current.push_back(c);
}
}
if (escaping) {
current.push_back('\\');
}
kv_overrides.push_back(current);
for (const auto & kv_override : kv_overrides) {
if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
}
}
}
@@ -2443,7 +2306,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--lora"}, "FNAME",
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
}
}
@@ -2454,7 +2317,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
"note: use comma-separated values",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
auto parts = string_split<std::string>(item, ':');
if (parts.size() != 2) {
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2468,7 +2331,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--control-vector"}, "FNAME",
"add a control vector\nnote: use comma-separated values to add multiple control vectors",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
params.control_vectors.push_back({ 1.0f, item, });
}
}
@@ -2478,7 +2341,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"add a control vector with user defined scaling SCALE\n"
"note: use comma-separated values (format: FNAME:SCALE,...)",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
auto parts = string_split<std::string>(item, ':');
if (parts.size() != 2) {
throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2576,7 +2439,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--context-file"}, "FNAME",
"file to load context from (use comma-separated values to specify multiple files)",
[](common_params & params, const std::string & value) {
for (const auto & item : parse_csv_row(value)) {
for (const auto & item : string_split<std::string>(value, ',')) {
std::ifstream file(item, std::ios::binary);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2723,7 +2586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.embd_normalize = value;
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--embd-output-format"}, "FORMAT",
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2801,7 +2664,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(common_arg(
{"--rerank", "--reranking"},
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2812,13 +2675,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
add_opt(common_arg(
{"--api-key"}, "KEY",
"API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
"API key to use for authentication (default: none)",
[](common_params & params, const std::string & value) {
for (const auto & key : parse_csv_row(value)) {
if (!key.empty()) {
params.api_keys.push_back(key);
}
}
params.api_keys.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
add_opt(common_arg(
@@ -2832,7 +2691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
std::string key;
while (std::getline(key_file, key)) {
if (!key.empty()) {
params.api_keys.push_back(key);
params.api_keys.push_back(key);
}
}
key_file.close();
@@ -2854,7 +2713,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
add_opt(common_arg(
{"--chat-template-kwargs"}, "STRING",
"sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
string_format("sets additional params for the json template parser"),
[](common_params & params, const std::string & value) {
auto parsed = json::parse(value);
for (const auto & item : parsed.items()) {
@@ -2877,18 +2736,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_threads_http = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
add_opt(common_arg(
{"--cache-prompt"},
{"--no-cache-prompt"},
string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.cache_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
add_opt(common_arg(
{"--cache-reuse"}, "N",
string_format(
"min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
"min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
"[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
),
[](common_params & params, int value) {
@@ -3500,27 +3351,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg(
{"--save-logits"},
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
[](common_params & params) {
params.save_logits = true;
}
).set_examples({LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--logits-output-dir"}, "PATH",
string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
[](common_params & params, const std::string & value) {
params.logits_output_dir = value;
}
).set_examples({LLAMA_EXAMPLE_DEBUG}));
add_opt(common_arg(
{"--tensor-filter"}, "REGEX",
"filter tensor names for debug output (regex pattern, can be specified multiple times)",
[](common_params & params, const std::string & value) {
params.tensor_filter.push_back(value);
}
).set_examples({LLAMA_EXAMPLE_DEBUG}));
// presets
add_opt(common_arg(

View File

@@ -129,3 +129,11 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// Options accepted by common_remote_get_content()
struct common_remote_params {
// extra request header lines; the curl implementation appends each entry as-is to the request header list
std::vector<std::string> headers;
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

View File

@@ -1403,118 +1403,6 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
builder.add_content(builder.consume_rest());
}
// Parses the non-reasoning part of an EXAONE MoE response: free text interleaved with
// <tool_call>...</tool_call> blocks. Each block carries a JSON object in one of two shapes:
static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
// 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
// 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
// the opening tag is matched loosely so that attributes inside the tag are tolerated
static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
// tool call parsing disabled: everything that is left is plain content
if (!builder.syntax().parse_tool_calls) {
LOG_DBG("%s: not parse_tool_calls\n", __func__);
builder.add_content(builder.consume_rest());
return;
}
LOG_DBG("%s: parse_tool_calls\n", __func__);
// Find all <tool_call></tool_call> blocks
// NOTE(review): add_prelude_to_content=true presumably emits the text preceding each match as content - confirm in common_chat_msg_parser
while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
builder.move_to(first->groups[0].end);
builder.consume_spaces();
// the JSON payload may optionally be wrapped in a ```json / ``` fence
builder.try_consume_literal("```json");
builder.try_consume_literal("```");
builder.consume_spaces();
// Consume JSON object
auto data = builder.consume_json();
builder.consume_spaces();
// optional closing fence
builder.try_consume_literal("```");
builder.consume_spaces();
// a missing closing tag is reported as a partial (incomplete) message
if (!builder.try_consume_literal("</tool_call>")) {
throw common_chat_msg_partial_exception("incomplete tool call");
}
builder.consume_spaces();
// Extract name and arguments
std::string name;
std::string id;
nlohmann::ordered_json arguments;
// fills name/arguments (and id when it is a string) from obj; returns false if either required key is absent
const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
if (!obj.contains("name") || !obj.contains("arguments")) {
return false;
}
name = obj.at("name").get<std::string>();
arguments = obj.at("arguments");
if (obj.contains("id") && obj.at("id").is_string()) {
id = obj.at("id").get<std::string>();
}
return true;
};
// shape 1 first; otherwise fall back to shape 2 where name/arguments live under "function"
if (!extract_args(data.json)) {
if (data.json.contains("function") && data.json.at("function").is_object()) {
auto fn = data.json.at("function");
// result intentionally unchecked: an empty name is handled below
extract_args(fn);
// in shape 2 the id may sit on the outer object instead of inside "function"
if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
id = data.json.at("id").get<std::string>();
}
}
}
// If name is empty, treat the JSON object as content
if (name.empty()) {
LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
builder.add_content(data.json.dump());
continue;
}
// arguments are passed on as serialized JSON text
std::string args_str = arguments.dump();
if (!builder.add_tool_call(name, id, args_str)) {
throw common_chat_msg_partial_exception("incomplete tool call");
}
}
// whatever follows the last tool call is regular content
builder.add_content(builder.consume_rest());
}
// Entry point for EXAONE MoE output: an optional "<think>...</think>" reasoning section
// followed by regular content (which may contain tool calls).
static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
    LOG_DBG("%s: parsing exaone_moe\n", __func__);
    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
    // First try to parse using the standard reasoning parsing method
    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());

    // look ahead for the closing tag, then rewind so the actual parsing starts from the same position
    const auto rewind_pos = builder.pos();
    const auto end_think  = builder.try_find_literal("</think>");
    builder.move_to(rewind_pos);

    // thinking was forced open, the message is complete, yet the closing tag never showed up
    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !end_think) {
        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // regular case: the reasoning section parses, the remainder is content
    if (builder.try_parse_reasoning("<think>", "</think>")) {
        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // reasoning extraction is disabled
    if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
        LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // no reasoning tags were found: without forced-open thinking the text is plain content
    if (!builder.syntax().thinking_forced_open) {
        LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // thinking is forced open and still unterminated: everything so far is reasoning
    LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
    builder.add_reasoning_content(builder.consume_rest());
}
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
builder.add_content(builder.consume_rest());
@@ -1602,9 +1490,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_SOLAR_OPEN:
common_chat_parse_solar_open(builder);
break;
case COMMON_CHAT_FORMAT_EXAONE_MOE:
common_chat_parse_exaone_moe(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}

View File

@@ -670,7 +670,6 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2540,65 +2539,6 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
return data;
}
// Builds the chat params (prompt, format, optional tool-call grammar) for EXAONE MoE templates.
// Returns a common_chat_params with format set to COMMON_CHAT_FORMAT_EXAONE_MOE.
static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
// the rendered prompt ends with an open thinking tag:
// - thinking disabled -> close the tag right away
// - thinking enabled  -> remember that the output starts inside the reasoning section
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>\n\n";
} else {
data.thinking_forced_open = true;
}
}
// a grammar is only built when at least one tool is provided
if (inputs.tools.is_array() && !inputs.tools.empty()) {
// the grammar is lazy unless a tool call is required or a JSON schema is set
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
// one "<name>-call" rule per declared tool
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
// Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
tool_rules.push_back(builder.add_rule(
name + "-call",
"\"<tool_call>\" space " +
builder.add_schema(name + "-obj", json{
{"type", "object"},
{"properties", {
{"name", json{{"const", name}}},
{"arguments", parameters},
}},
{"required", json::array({"name", "arguments"})},
}) +
" space \"</tool_call>\" space"));
});
// any of the per-tool rules is a valid tool call
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
// root: optional "</think>" (only when thinking is forced open), then one tool call,
// or one-or-more when parallel tool calls are allowed
builder.add_rule("root",
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
// trigger pattern: the first "<tool_call>", optionally preceded by reasoning text and "</think>"
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
"(<tool_call>)[\\s\\S]*"
});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool_call>",
"</tool_call>",
};
});
}
return data;
}
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
data.prompt = apply(tmpl, inputs);
@@ -2769,13 +2709,6 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_xiaomi_mimo(tmpl, params);
}
// EXAONE MoE format detection
if (src.find("<tool_call>") != std::string::npos &&
src.find("<tool_result>") != std::string::npos &&
src.find("<|tool_declare|>") != std::string::npos) {
return common_chat_params_init_exaone_moe(tmpl, params);
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);

View File

@@ -125,7 +125,6 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
COMMON_CHAT_FORMAT_SOLAR_OPEN,
COMMON_CHAT_FORMAT_EXAONE_MOE,
// These are intended to be parsed by the PEG parser
COMMON_CHAT_FORMAT_PEG_SIMPLE,

View File

@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
if (params.fit_params) {
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
@@ -1366,7 +1366,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
mparams.use_mmap = params.use_mmap;
mparams.use_direct_io = params.use_direct_io;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;

View File

@@ -80,8 +80,6 @@ int32_t cpu_get_num_math();
//
enum llama_example {
LLAMA_EXAMPLE_BATCHED,
LLAMA_EXAMPLE_DEBUG,
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_COMPLETION,
@@ -333,14 +331,12 @@ struct common_params {
// offload params
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
// margin per device in bytes for fitting parameters to free memory:
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
@@ -376,11 +372,6 @@ struct common_params {
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
// llama-debug specific options
std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
bool save_logits = false; // whether to save logits to files // NOLINT
std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
@@ -431,8 +422,7 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // enable mmap to use filesystem cache
bool use_direct_io = true; // read from disk without buffering for faster model loading
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
@@ -476,7 +466,6 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
bool cache_prompt = true; // whether to enable prompt caching
int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

View File

@@ -1,165 +0,0 @@
#include "debug.h"
#include "log.h"
#include <cmath>
#include <string>
// Formats the tensor extents as a comma separated list: "ne0, ne1, ..." (GGML_MAX_DIMS entries)
static std::string common_ggml_ne_string(const ggml_tensor * t) {
    std::string out;
    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        // the separator goes in front of every entry except the first one
        if (dim > 0) {
            out += ", ";
        }
        out += std::to_string(t->ne[dim]);
    }
    return out;
}
// Reads the element at index (i0, i1, i2, i3) from a raw tensor buffer and converts it to float.
//   data - start of the tensor bytes
//   type - element type; F16, F32, BF16, I8, I16, I32 and I64 are handled, anything else aborts
//   nb   - per-dimension strides in bytes
static float common_ggml_get_float_value(const uint8_t * data,
                                         ggml_type       type,
                                         const size_t *  nb,
                                         size_t          i0,
                                         size_t          i1,
                                         size_t          i2,
                                         size_t          i3) {
    const size_t    offset = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
    const uint8_t * elem   = &data[offset];

    float v;
    switch (type) {
        case GGML_TYPE_F16:
            v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) elem);
            break;
        case GGML_TYPE_F32:
            v = *(const float *) elem;
            break;
        case GGML_TYPE_I64:
            v = (float) *(const int64_t *) elem;
            break;
        case GGML_TYPE_I32:
            v = (float) *(const int32_t *) elem;
            break;
        case GGML_TYPE_I16:
            v = (float) *(const int16_t *) elem;
            break;
        case GGML_TYPE_I8:
            v = (float) *(const int8_t *) elem;
            break;
        case GGML_TYPE_BF16:
            v = ggml_bf16_to_fp32(*(const ggml_bf16_t *) elem);
            break;
        default:
            GGML_ABORT("fatal error");
    }
    return v;
}
// Prints the values of a tensor through LOG_ERR, followed by the sum of all its elements.
//   data  - raw tensor bytes
//   type  - element type (must be one that common_ggml_get_float_value supports)
//   ne    - extents per dimension, nb - strides in bytes per dimension
//   n     - number of leading and trailing entries printed per dimension (must be > 0);
//           the middle part of dimensions 0..2 is replaced by "..." when the extent exceeds 2*n
//   abort - when true, the process exits if the sum is NaN
template <bool abort>
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
// first pass: accumulate the sum over every element (not only the printed ones)
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
sum += v;
}
}
}
}
// second pass: print the values; dimension 3 is always printed in full
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG_ERR(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
// after the first n entries jump to the last n ones (the loop index is modified on purpose)
if (i2 == n && ne[2] > 2 * n) {
LOG_ERR(" ..., \n");
i2 = ne[2] - n;
}
LOG_ERR(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2 * n) {
LOG_ERR(" ..., \n");
i1 = ne[1] - n;
}
LOG_ERR(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2 * n) {
LOG_ERR("..., ");
i0 = ne[0] - n;
}
const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG_ERR("%12.4f", v);
// no separator after the last value of a row
if (i0 < ne[0] - 1) {
LOG_ERR(", ");
}
}
LOG_ERR("],\n");
}
LOG_ERR(" ],\n");
}
LOG_ERR(" ]\n");
// the sum covers the whole tensor and is repeated after every slice of dimension 3
LOG_ERR(" sum = %f\n", sum);
}
// NOTE(review): the process terminates with exit status 0 even though a NaN was found - confirm this is intended
if constexpr (abort) {
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
exit(0);
}
}
}
/**
 * GGML operations callback during the graph execution.
 *
 * Prints a one-line summary (name, type, op, sources, shape) of every tensor whose name matches
 * one of the filters in the callback data (or of every tensor when no filter is set), followed
 * by its values when the tensor type is not quantized.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
 * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 * see ggml_backend_sched_eval_callback
 * @param user_data user data to pass at each call back; must point to a base_callback_data
 * @return true to receive data or continue the graph, false otherwise
 */
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (base_callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true;  // Always retrieve data
    }

    // an empty filter list matches everything
    bool matches_filter = cb_data->tensor_filters.empty();
    if (!matches_filter) {
        for (const auto & filter : cb_data->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
            }
        }
    }

    char src1_str[128] = { 0 };
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
    }

    if (matches_filter) {
        // src0 is guarded the same way as src1: a tensor without sources must not crash the callback
        const char *      src0_name = src0 ? src0->name : "";
        const std::string src0_ne   = src0 ? common_ggml_ne_string(src0) : std::string();

        LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type),
                ggml_op_desc(t), src0_name, src0_ne.c_str(), src1 ? src1_str : "",
                common_ggml_ne_string(t).c_str());
    }

    // data that does not live in host memory is copied into the scratch buffer of the callback data
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    // values are only printed for non-quantized types
    if (!ggml_is_quantized(t->type) && matches_filter) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}
// Explicit template instantiations
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);

View File

@@ -1,43 +0,0 @@
#pragma once
#include "common.h"
#include <string>
#include <vector>
#include <regex>
// common debug functions and structs
// Print a tensor's detailed data
// data - the tensor's data in byte format
// type - the tensor's quantization type
// ne - the tensor dimensions array
// nb - the tensor strides array
// n - the number of rows/columns to fully print
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
// Intended to use as callback for ggml_backend_sched_eval_callback
// prints tensors that are processed in the computation graph
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
// The callback data will be passed as the third parameter (user_data)
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
// State shared with common_debug_cb_eval through the cb_eval_user_data pointer
struct base_callback_data {
    std::vector<uint8_t>    data;            // scratch buffer for tensor bytes copied by the callback
    std::vector<std::regex> tensor_filters;  // compiled name filters; empty means "match all"

    base_callback_data() = default;

    // Compiles every pattern (anchored at the start of the name) and registers this object
    // together with common_debug_cb_eval<false> as the eval callback in params.
    // Throws std::runtime_error when a pattern is not a valid regex.
    // Note: params keeps a pointer to this object.
    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
        for (const std::string & raw : filter_patterns) {
            const std::string anchored = "^" + raw;
            try {
                tensor_filters.emplace_back(anchored, std::regex::optimize);
            } catch (const std::regex_error & err) {
                throw std::runtime_error("Invalid regex pattern '" + raw + "': " + err.what());
            }
        }

        params.cb_eval_user_data = this;
        params.cb_eval           = common_debug_cb_eval<false>;
    }
};

View File

@@ -19,7 +19,10 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_HTTPLIB)
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#elif defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
@@ -154,21 +157,322 @@ static std::string read_etag(const std::string & path) {
return none;
}
// Returns true for HTTP status codes in the range [200, 400)
static bool is_http_status_ok(int status) {
    const bool below_range = status < 200;
    const bool above_range = status >= 400;
    return !(below_range || above_range);
}
#ifdef LLAMA_USE_CURL
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
return {hf_repo, tag};
};
// Performs the transfer configured on `curl`.
// On failure the libcurl error description is logged; the CURLcode is returned unchanged so that
// the caller decides how to react.
static CURLcode common_curl_perf(CURL * curl) {
    const CURLcode res = curl_easy_perform(curl);

    if (res != CURLE_OK) {
        // include the reason, otherwise the log gives no hint about what went wrong
        LOG_ERR("%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
    }

    return res;
}
#if defined(LLAMA_USE_HTTPLIB)
// Response header values of interest, filled in by common_header_callback
// (the struct is handed to libcurl through CURLOPT_HEADERDATA)
struct common_load_model_from_url_headers {
std::string etag; // value of the ETag header
std::string last_modified; // value of the Last-Modified header
std::string accept_ranges; // value of the Accept-Ranges header
};
// Deleter for std::unique_ptr<FILE, ...>: closes the stream when the owner goes out of scope
struct FILE_deleter {
    void operator()(FILE * f) const {
        fclose(f);
    }
};
static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
std::string header(buffer, n_items);
std::smatch match;
if (std::regex_match(header, match, header_regex)) {
const std::string & key = match[1];
const std::string & value = match[2];
if (std::regex_match(key, match, etag_regex)) {
headers->etag = value;
} else if (std::regex_match(key, match, last_modified_regex)) {
headers->last_modified = value;
} else if (std::regex_match(key, match, accept_ranges_regex)) {
headers->accept_ranges = value;
}
}
return n_items;
}
// libcurl write callback: forwards the received chunk to the FILE stream passed as `fd`
// and returns whatever std::fwrite reports
static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
    FILE *       out       = static_cast<FILE *>(fd);
    const size_t n_written = std::fwrite(data, size, nmemb, out);
    return n_written;
}
// helper function to hide password in URL
//
// Replaces the "user[:password]@" part of a URL with "********@" so that the URL can be logged.
// URLs without credentials (and strings that are not URLs at all) are returned unchanged.
static std::string llama_download_hide_password_in_url(const std::string & url) {
    // Pattern: scheme://user[:password]@host[...]
    //   group 1 = scheme including "://"
    //   group 2 = "user[:password]@" (must not contain '/' so that an '@' in the path is not mistaken for it)
    //   group 3 = rest of the URL (host, path, query, ...)
    static const std::regex url_regex(R"(^([A-Za-z][A-Za-z0-9+.-]*://)([^/@]+@)(.*)$)");

    std::smatch match;
    if (std::regex_match(url, match, url_regex)) {
        return match[1].str() + "********@" + match[3].str();
    }

    return url;  // No credentials found or malformed URL
}
// Configures `curl` for a HEAD request to `url`; response headers go to common_header_callback
static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
    // request without body (will trigger the HEAD verb), no progress output for it
    curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);
    curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);

#    if defined(_WIN32)
    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
    // operating system. Currently implemented under MS-Windows.
    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#    endif

    // target URL, http redirections are followed
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
}
// Switches a handle previously set up for HEAD over to downloading the body
static void common_curl_easy_setopt_get(CURL * curl) {
    // received data is handed to common_write_callback
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
    // ask for the body again and display download progress
    curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
}
// Downloads the body of the URL configured on `curl` into `path_temporary`.
// An already existing file is continued: its size becomes the start of the requested byte range
// and new data is appended. Returns true when the transfer finished with CURLE_OK.
static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
    const bool has_partial = std::filesystem::exists(path_temporary);
    if (has_partial) {
        const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
        LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());

        // open ended range: "<offset>-"
        const std::string range_str = partial_size + "-";
        curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
    }

    // append mode in all cases, since the download could be a resumed one
    std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
    if (outfile == nullptr) {
        LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
        return false;
    }

    common_curl_easy_setopt_get(curl);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());

    const CURLcode res = common_curl_perf(curl);
    return res == CURLE_OK;
}
// Sends a HEAD request for `url`.
//   curl         - handle to use; a null handle is reported as an initialization error
//   http_headers - header list that receives the User-Agent (and Authorization) entries
//   bearer_token - sent as "Authorization: Bearer <token>" when not empty
// Returns true when the request was performed with CURLE_OK.
static bool common_download_head(CURL *              curl,
                                 curl_slist_ptr &    http_headers,
                                 const std::string & url,
                                 const std::string & bearer_token) {
    if (curl == nullptr) {
        LOG_ERR("%s: error initializing libcurl\n", __func__);
        return false;
    }

    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");

    // Check if hf-token or bearer-token was specified
    const bool has_token = !bearer_token.empty();
    if (has_token) {
        const std::string auth_header = "Authorization: Bearer " + bearer_token;
        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
    }

    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
    common_curl_easy_setopt_head(curl, url);

    const CURLcode res = common_curl_perf(curl);
    return res == CURLE_OK;
}
// download one single file from remote URL to local path
//
// Each attempt (at most max_attempts) does the following:
//   1. HEAD request to read the ETag / Last-Modified / Accept-Ranges headers
//   2. decide whether a download is needed: the file is missing, or its stored ETag differs
//   3. download into "<path>.downloadInProgress" and rename it to `path` on success
//
// returns true when the file is available at `path` (freshly downloaded or cached),
// false on any error, including when every download attempt failed
static bool common_download_file_single_online(const std::string & url,
                                               const std::string & path,
                                               const std::string & bearer_token) {
    static const int max_attempts        = 3;
    static const int retry_delay_seconds = 2;

    for (int i = 0; i < max_attempts; ++i) {
        std::string etag;

        // Check if the file already exists locally
        const auto file_exists = std::filesystem::exists(path);
        if (file_exists) {
            etag = read_etag(path);
        } else {
            LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
        }

        bool head_request_ok = false;
        bool should_download = !file_exists;  // by default, we should download if the file does not exist

        // Initialize libcurl
        curl_ptr                           curl(curl_easy_init(), &curl_easy_cleanup);
        common_load_model_from_url_headers headers;
        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
        curl_slist_ptr http_headers;
        const bool     was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
        if (!was_perform_successful) {
            head_request_ok = false;
        }

        long http_code = 0;
        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
        if (http_code == 200) {
            head_request_ok = true;
        } else {
            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
            head_request_ok = false;
        }

        // if head_request_ok is false, we don't have the etag or last-modified headers
        // we leave should_download as-is, which is true if the file does not exist
        bool should_download_from_scratch = false;
        if (head_request_ok) {
            // check if the ETag header is different from the stored one
            // if it is, we need to download the file again
            if (!etag.empty() && etag != headers.etag) {
                LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
                        headers.etag.c_str());
                should_download              = true;
                should_download_from_scratch = true;
            }
        }

        const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
        if (should_download) {
            if (file_exists &&
                !accept_ranges_supported) {  // Resumable downloads not supported, delete and start again.
                LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
                if (remove(path.c_str()) != 0) {
                    LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
                    return false;
                }
            }

            const std::string path_temporary = path + ".downloadInProgress";
            if (should_download_from_scratch) {
                // the remote file changed: neither a partial nor a complete local copy can be reused
                if (std::filesystem::exists(path_temporary)) {
                    if (remove(path_temporary.c_str()) != 0) {
                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
                        return false;
                    }
                }

                if (std::filesystem::exists(path)) {
                    if (remove(path.c_str()) != 0) {
                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
                        return false;
                    }
                }
            }
            if (head_request_ok) {
                write_etag(path, headers.etag);
            }

            // start the download
            LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
                    __func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
                    headers.etag.c_str(), headers.last_modified.c_str());
            const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
            if (!was_pull_successful) {
                if (i + 1 < max_attempts) {
                    const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
                    LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
                    std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
                    continue;
                }

                // last attempt failed as well: report the failure instead of falling out of the loop
                // (which would make the function return true although nothing was downloaded)
                LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
                return false;
            }

            long http_code = 0;
            curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
            if (http_code < 200 || http_code >= 400) {
                LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
                return false;
            }

            if (rename(path_temporary.c_str(), path.c_str()) != 0) {
                LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
                return false;
            }
        } else {
            LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
        }

        break;
    }

    return true;
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::vector<char> res_buffer;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
auto data_vec = static_cast<std::vector<char> *>(data);
data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (params.timeout > 0) {
curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
}
if (params.max_size > 0) {
curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
}
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
for (const auto & header : params.headers) {
http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
}
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
std::string error_msg = curl_easy_strerror(res);
throw std::runtime_error("error: cannot make GET request: " + error_msg);
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
return { res_code, std::move(res_buffer) };
}
#elif defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
@@ -313,11 +617,9 @@ static bool common_pull_file(httplib::Client & cli,
}
// download one single file from remote URL to local path
// returns status code or -1 on error
static int common_download_file_single_online(const std::string & url,
static bool common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token,
const common_header_list & custom_headers) {
const std::string & bearer_token) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
@@ -327,9 +629,6 @@ static int common_download_file_single_online(const std::string & url,
if (!bearer_token.empty()) {
default_headers.insert({"Authorization", "Bearer " + bearer_token});
}
for (const auto & h : custom_headers) {
default_headers.emplace(h.first, h.second);
}
cli.set_default_headers(default_headers);
const bool file_exists = std::filesystem::exists(path);
@@ -348,10 +647,8 @@ static int common_download_file_single_online(const std::string & url,
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
return true;
}
return head->status; // cannot use cached file, return raw status code
// TODO: maybe retry only on certain codes
}
std::string etag;
@@ -383,12 +680,12 @@ static int common_download_file_single_online(const std::string & url,
if (file_exists) {
if (!should_download_from_scratch) {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
return 304; // 304 Not Modified - fake cached response
return true;
}
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return -1;
return false;
}
}
@@ -400,7 +697,7 @@ static int common_download_file_single_online(const std::string & url,
existing_size = std::filesystem::file_size(path_temporary);
} else if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
return -1;
return false;
}
}
@@ -421,16 +718,15 @@ static int common_download_file_single_online(const std::string & url,
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return -1;
return false;
}
if (!etag.empty()) {
write_etag(path, etag);
}
return head->status; // TODO: use actual GET status?
break;
}
return -1; // max attempts reached
return true;
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
@@ -438,9 +734,13 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
for (const auto & header : params.headers) {
headers.emplace(header.first, header.second);
size_t pos = header.find(':');
if (pos != std::string::npos) {
headers.emplace(header.substr(0, pos), header.substr(pos + 1));
} else {
headers.emplace(header, "");
}
}
if (params.timeout > 0) {
@@ -465,45 +765,36 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
return { res->status, std::move(buf) };
}
int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
#endif // LLAMA_USE_CURL
#if defined(LLAMA_USE_CURL) || defined(LLAMA_USE_HTTPLIB)
static bool common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline) {
if (!offline) {
return common_download_file_single_online(url, path, bearer_token, headers);
return common_download_file_single_online(url, path, bearer_token);
}
if (!std::filesystem::exists(path)) {
LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
return -1;
return false;
}
LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
return 304; // Not Modified - fake cached response
return true;
}
// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
futures_download.reserve(urls.size());
for (auto const & item : urls) {
futures_download.push_back(
std::async(
std::launch::async,
[&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
return is_http_status_ok(http_status);
},
item
)
);
futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
return common_download_file_single(it.first, it.second, bearer_token, offline);
}, item));
}
// Wait for all downloads to complete
@@ -516,18 +807,17 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::strin
return true;
}
bool common_download_model(const common_params_model & model,
const std::string & bearer_token,
bool offline,
const common_header_list & headers) {
bool common_download_model(
const common_params_model & model,
const std::string & bearer_token,
bool offline) {
// Basic validation of the model.url
if (model.url.empty()) {
LOG_ERR("%s: invalid model url\n", __func__);
return false;
}
const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
if (!is_http_status_ok(http_status)) {
if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
return false;
}
@@ -586,26 +876,27 @@ bool common_download_model(const common_params_model & model,
}
// Download in parallel
common_download_file_multiple(urls, bearer_token, offline, headers);
common_download_file_multiple(urls, bearer_token, offline);
}
return true;
}
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & custom_headers) {
// the returned hf_repo is without tag
auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);
common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
// headers
common_header_list headers = custom_headers;
headers.push_back({"Accept", "application/json"});
std::vector<std::string> headers;
headers.push_back("Accept: application/json");
if (!bearer_token.empty()) {
headers.push_back({"Authorization", "Bearer " + bearer_token});
headers.push_back("Authorization: Bearer " + bearer_token);
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
// User-Agent header is already set in common_remote_get_content, no need to set it here
@@ -661,7 +952,7 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
@@ -740,10 +1031,9 @@ std::string common_docker_resolve_model(const std::string & docker) {
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
std::string manifest_url = url_prefix + "/manifests/" + tag;
common_remote_params manifest_params;
manifest_params.headers.push_back({"Authorization", "Bearer " + token});
manifest_params.headers.push_back({"Accept",
"application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json"
});
manifest_params.headers.push_back("Authorization: Bearer " + token);
manifest_params.headers.push_back(
"Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
if (manifest_res.first != 200) {
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
@@ -780,8 +1070,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
std::string local_path = fs_get_cache_file(model_filename);
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
if (!is_http_status_ok(http_status)) {
if (!common_download_file_single(blob_url, local_path, token, false)) {
throw std::runtime_error("Failed to download Docker Model");
}
@@ -795,11 +1084,11 @@ std::string common_docker_resolve_model(const std::string & docker) {
#else
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
throw std::runtime_error("download functionality is not enabled in this build");
}
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
bool common_download_model(const common_params_model &, const std::string &, bool) {
throw std::runtime_error("download functionality is not enabled in this build");
}
@@ -807,15 +1096,7 @@ std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
int common_download_file_single(const std::string &,
const std::string &,
const std::string &,
bool,
const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // defined(LLAMA_USE_HTTPLIB)
#endif // LLAMA_USE_CURL || LLAMA_USE_HTTPLIB
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;

View File

@@ -1,27 +1,12 @@
#pragma once
#include <string>
#include <vector>
struct common_params_model;
using common_header = std::pair<std::string, std::string>;
using common_header_list = std::vector<common_header>;
struct common_remote_params {
common_header_list headers;
long timeout = 0; // in seconds, 0 means no timeout
long max_size = 0; // unlimited if 0
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
// split HF repo with tag into <repo, tag>
// for example: "user/model:tag" -> <"user/model", "tag">
// if tag is not present, default to "latest"
// example: "user/model" -> <"user/model", "latest">
std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);
//
// download functionalities
//
struct common_cached_model_info {
std::string manifest_path;
@@ -56,29 +41,17 @@ struct common_hf_file_res {
common_hf_file_res common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {}
);
bool offline);
// returns true if download succeeded
bool common_download_model(
const common_params_model & model,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {}
);
bool offline);
// returns list of cached models
std::vector<common_cached_model_info> common_list_cached_models();
// download single file from url to local path
// returns status code or -1 on error
int common_download_file_single(const std::string & url,
const std::string & path,
const std::string & bearer_token,
bool offline,
const common_header_list & headers = {});
// resolve and download model from Docker registry
// return local path to downloaded model file
std::string common_docker_resolve_model(const std::string & docker);

View File

@@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) {
return str.substr(pos);
}
// Build the set of preset keys that a remote preset is permitted to use.
//
// Security note: only a small subset of args is allowed for remote presets.
// Do not extend the list unless absolutely necessary; args that output to
// files are strictly prohibited.
//
// key_to_opt maps a preset key to its option descriptor. For every permitted
// option the result contains the key itself plus its variant spellings
// (argument names without leading dashes, and environment variable names).
static std::set<std::string> get_remote_preset_whitelist(const std::map<std::string, common_arg> & key_to_opt) {
    static const std::set<std::string> base_whitelist = {
        "model-url",
        "hf-repo",
        "hf-repo-draft",
        "hf-repo-v", // vocoder
        "hf-file-v", // vocoder
        "mmproj-url",
        "pooling",
        "jinja",
        "batch-size",
        "ubatch-size",
        "cache-reuse",
        "chat-template-kwargs",
        "mmap",
        // note: sampling params are automatically allowed by default
        // negated args will be added automatically if the positive arg is specified above
    };

    std::set<std::string> result;
    for (const auto & [name, option] : key_to_opt) {
        // sampling params are always permitted, everything else must be listed explicitly
        const bool permitted = option.is_sparam || base_whitelist.count(name) > 0;
        if (!permitted) {
            continue;
        }

        result.insert(name);

        // also accept the variant keys: args without leading dashes ...
        for (const auto & flag : option.get_args()) {
            result.insert(rm_leading_dashes(flag));
        }
        // ... and env vars
        for (const auto & env_name : option.get_env()) {
            result.insert(env_name);
        }
    }

    return result;
}
std::vector<std::string> common_preset::to_args(const std::string & bin_path) const {
std::vector<std::string> args;
@@ -163,29 +121,6 @@ void common_preset::merge(const common_preset & other) {
}
}
// Apply every option stored in this preset to `params` by invoking the handler
// registered on the option with the stored string value.
//
// Integer options are parsed with std::stoi and boolean options with
// common_arg_utils::is_truthy. Options that take two values are not supported
// and raise std::runtime_error; an option with no handler at all aborts.
void common_preset::apply_to_params(common_params & params) const {
    for (const auto & [option, value] : options) {
        if (option.handler_string) {
            option.handler_string(params, value);
            continue;
        }

        if (option.handler_int) {
            option.handler_int(params, std::stoi(value));
            continue;
        }

        if (option.handler_bool) {
            option.handler_bool(params, common_arg_utils::is_truthy(value));
            continue;
        }

        if (option.handler_str_str) {
            // not supported yet
            throw std::runtime_error(string_format(
                "%s: option with two values is not supported yet",
                __func__
            ));
        }

        if (option.handler_void) {
            option.handler_void(params);
            continue;
        }

        GGML_ABORT("unknown handler type");
    }
}
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
std::map<std::string, std::map<std::string, std::string>> parsed;
@@ -295,16 +230,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke
return value;
}
common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed)
common_preset_context::common_preset_context(llama_example ex)
: ctx_params(common_params_parser_init(default_params, ex)) {
common_params_add_preset_options(ctx_params.options);
key_to_opt = get_map_key_opt(ctx_params);
// setup allowed keys if only_remote_allowed is true
if (only_remote_allowed) {
filter_allowed_keys = true;
allowed_keys = get_remote_preset_whitelist(key_to_opt);
}
}
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
@@ -320,18 +249,7 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
}
LOG_DBG("loading preset: %s\n", preset.name.c_str());
for (const auto & [key, value] : section.second) {
if (key == "version") {
// skip version key (reserved for future use)
continue;
}
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
if (filter_allowed_keys && allowed_keys.find(key) == allowed_keys.end()) {
throw std::runtime_error(string_format(
"option '%s' is not allowed in remote presets",
key.c_str()
));
}
if (key_to_opt.find(key) != key_to_opt.end()) {
const auto & opt = key_to_opt.at(key);
if (is_bool_arg(opt)) {
@@ -341,10 +259,7 @@ common_presets common_preset_context::load_from_ini(const std::string & path, co
}
LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
} else {
throw std::runtime_error(string_format(
"option '%s' not recognized in preset '%s'",
key.c_str(), preset.name.c_str()
));
// TODO: maybe warn about unknown key?
}
}

View File

@@ -6,7 +6,6 @@
#include <string>
#include <vector>
#include <map>
#include <set>
//
// INI preset parser and writer
@@ -41,9 +40,6 @@ struct common_preset {
// merge another preset into this one, overwriting existing options
void merge(const common_preset & other);
// apply preset options to common_params
void apply_to_params(common_params & params) const;
};
// interface for multiple presets in one file
@@ -54,12 +50,7 @@ struct common_preset_context {
common_params default_params; // unused for now
common_params_context ctx_params;
std::map<std::string, common_arg> key_to_opt;
bool filter_allowed_keys = false;
std::set<std::string> allowed_keys;
// if only_remote_allowed is true, only accept whitelisted keys
common_preset_context(llama_example ex, bool only_remote_allowed = false);
common_preset_context(llama_example ex);
// load presets from INI file
common_presets load_from_ini(const std::string & path, common_preset & global) const;

View File

@@ -528,11 +528,7 @@ class ModelBase:
return ()
def prepare_tensors(self):
# Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
if self.tensor_map.mapping:
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
else:
max_name_len = len("vision_encoder.weight,") # Default reasonable length
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
# we don't need these
@@ -775,8 +771,8 @@ class TextModel(ModelBase):
self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
@@ -1252,9 +1248,6 @@ class TextModel(ModelBase):
if chkhsh == "16389f0a1f51ee53e562ffd51c371dc508639ab0e4261502071836e50e223e91":
# ref: https://huggingface.co/upstage/Solar-Open-100B
res = "solar-open"
if chkhsh == "6c81ce329e0802883b22eabab0d3fa48357337ef1ecb45443828bf1f6254833f":
# ref: https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B
res = "exaone-moe"
if res is None:
logger.warning("\n")
@@ -4370,37 +4363,7 @@ class Qwen3NextModel(Qwen2MoeModel):
elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
data_torch = data_torch + 1
if "in_proj_qkvz.weight" in name:
# original order: [q, k, v, z] * head_count
# corrected order: [q * head_count, k * head_count, v * head_count, z * head_count]
head_k_dim = self.hparams["linear_key_head_dim"]
head_v_dim = self.hparams["linear_value_head_dim"]
num_v_heads = self.hparams["linear_num_value_heads"]
num_k_heads = self.hparams["linear_num_key_heads"]
hidden_size = self.hparams["hidden_size"]
split_arg_list_qkvz = [
head_k_dim, # q partition
head_k_dim, # k partition
(num_v_heads // num_k_heads * head_v_dim), # v partition
(num_v_heads // num_k_heads * head_v_dim), # z partition
]
# view as (n_embd, head_count, [q+k+v+z])
data_torch = data_torch.permute(1, 0).contiguous()
data_torch = data_torch.view(-1, num_k_heads, sum(split_arg_list_qkvz))
# split into q, k, v, z
q, k, v, z = torch.split(data_torch, split_arg_list_qkvz, dim=-1)
# flatten dim + head_count
q = q.contiguous().view(hidden_size, -1)
k = k.contiguous().view(hidden_size, -1)
v = v.contiguous().view(hidden_size, -1)
z = z.contiguous().view(hidden_size, -1)
# stack back
qkv = torch.cat([q, k, v], dim=-1).permute(1, 0).contiguous()
z = z.permute(1, 0).contiguous()
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, ".weight"), qkv)
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_GATE, bid, ".weight"), z)
else:
yield from super().modify_tensors(data_torch, name, bid)
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("RND1")
@@ -6075,175 +6038,7 @@ class Gemma3VisionModel(MmprojModel):
return [] # skip other tensors
class ConformerAudioModel(MmprojModel):
    """Shared tensor handling for mmproj models with a conformer audio encoder.

    Forces convolution tensors with audio-related names to F32, folds
    batch-norm running statistics into a plain weight/bias pair, and reshapes
    convolution tensors before mapping them to their output names.
    """

    # One dict per block holding the batch_norm tensors seen so far, keyed by
    # tensor name; allocated lazily when the first batch_norm tensor arrives.
    _batch_norm_tensors: list[dict[str, Tensor]] | None = None

    @staticmethod
    def is_audio_tensor(name: str):
        """Return True if `name` contains any of the audio-related substrings."""
        return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Return F32 for audio convolution tensors, else defer to the parent class."""
        if ConformerAudioModel.is_audio_tensor(name):
            # Grouping is spelled out because `and` binds tighter than `or`:
            # any name containing ".conv" matches, while a name containing
            # "_conv" matches only if it also contains ".weight".
            if ".conv" in name or ("_conv" in name and ".weight" in name):
                return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one source tensor to zero or more (output name, tensor) pairs.

        batch_norm tensors are buffered per block and emitted as a folded
        weight/bias pair once the block is complete; all other tensors are
        reshaped where needed and emitted immediately.
        """
        # fold running_mean, running_var and eps into weight and bias for batch_norm
        if "batch_norm" in name:
            if self._batch_norm_tensors is None:
                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
            assert bid is not None
            self._batch_norm_tensors[bid][name] = data_torch

            # Wait until five batch_norm tensors were collected for this block.
            # NOTE(review): only four are read below; the fifth is presumably
            # the batch-norm bookkeeping buffer from the checkpoint - confirm.
            if len(self._batch_norm_tensors[bid]) < 5:
                return []

            weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
            bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
            running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
            running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
            eps = 1e-5  # default value

            # (x - mean) / sqrt(var + eps) * weight + bias  ==  a * x + b
            a = weight / torch.sqrt(running_var + eps)
            b = bias - running_mean * a
            return [
                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
                (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
            ]

        # reshape conv weights
        if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
            # append two singleton dims to the bias
            data_torch = data_torch[:, None, None]
        if "conv.depthwise_conv" in name and name.endswith(".weight"):
            # drop the singleton middle dim: (d0, 1, d2) -> (d0, d2)
            assert data_torch.shape[1] == 1
            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
        if "conv.pointwise_conv" in name and name.endswith(".weight"):
            # drop the singleton last dim: (d0, d1, 1) -> (d0, d1)
            assert data_torch.shape[2] == 1
            data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])

        return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("Gemma3nForConditionalGeneration")
class Gemma3nVisionAudioModel(ConformerAudioModel):
    """mmproj converter for Gemma3n handling both the vision and the audio encoder."""

    has_audio_encoder = True
    has_vision_encoder = True

    # Double indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
    # This is the only known model having this, so we prefer implementing it outside of tensor_mapping.py
    block_tensor_mapping = {
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight",
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight",
    }

    def __init__(self, *args, **kwargs):
        """Initialise the parent, then fill in / remap the vision and audio hparams."""
        # Parent init will call find_hparam which now returns 0 for empty keys
        super().__init__(*args, **kwargs)

        vision = self.hparams_vision
        assert vision is not None
        vision["n_layers"] = 128  # fake value for audio encoder, vision encoder doesn't use it
        vision["intermediate_size"] = vision.get("intermediate_size", 2048) * 4
        vision["num_attention_heads"] = vision.get("num_attention_heads", 8)

        # MobileNetV5 does not use image_mean/std
        self.preprocessor_config["image_mean"] = [0.0, 0.0, 0.0]
        self.preprocessor_config["image_std"] = [1.0, 1.0, 1.0]

        # image size is the configured height, 768 when the config has no "size" entry
        size_cfg = self.preprocessor_config.get("size", {"height": 768, "width": 768})
        image_size = size_cfg["height"]
        vision["image_size"] = image_size

        # Image sequence length (256 tokens = 16x16 for Gemma3n)
        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
        vision["patch_size"] = image_size // image_seq_length

        # remap audio hparams
        audio = self.hparams_audio
        assert audio is not None
        audio["n_layers"] = audio["conf_num_hidden_layers"]
        audio["num_attention_heads"] = audio["conf_num_attention_heads"]
        audio["feat_in"] = audio["input_feat_size"]
        audio["intermediate_size"] = audio.get("intermediate_size", 6144)

    def set_gguf_parameters(self):
        """Write the parent's parameters plus the Gemma3n vision and audio projector settings."""
        super().set_gguf_parameters()
        writer = self.gguf_writer

        # vision params
        writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
        writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))

        # audio params
        assert self.hparams_audio is not None
        writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
        writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
        writer.add_audio_attention_layernorm_eps(1e-5)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Pin input projections to F16 and embeddings/stem tensors to F32; otherwise defer to the parent."""
        if any(tag in name for tag in ("input_projection", "input_proj")):
            return gguf.GGMLQuantizationType.F16
        if any(tag in name for tag in (".embeddings.", "stem")):
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def custom_map(self, name: str) -> str:
        """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
        pieces = name.split(".")
        # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
        if len(pieces) >= 7:
            bid, sid, *tail = pieces[4:]
            suffix = ".".join(tail)
            template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
            target = self.block_tensor_mapping.get(template)
            if target is not None:
                return target.format(bid=bid, sid=sid)

        raise ValueError(f"Unknown name: {name}")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Route audio tensors to the parent class and map vision tensors here; drop everything else."""
        if ConformerAudioModel.is_audio_tensor(name):
            name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
            return super().modify_tensors(data_torch, name, bid)

        # Gemma3n uses
        # - model.embed_vision.* for projection layers
        # - model.vision_tower.* for vision encoder
        # Skip non-vision tensors
        if not name.startswith(("model.embed_vision.", "model.vision_tower.")):
            return []

        if name.startswith("model.vision_tower.timm_model.blocks."):
            # Double-indexed block tensors through custom logic
            mapped_name = self.custom_map(name)
        else:
            # Route non-repeating (conv_stem, msfa, embedding, etc.) and uncaught names through tensor_mapping.py
            mapped_name = self.map_tensor_name(name)

        if mapped_name.endswith(("conv_stem.conv.bias", "layer_scale.gamma")):
            data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)  # [1, C, 1, 1]

        return [(mapped_name, data_torch)]
@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
class Gemma3NModel(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA3N
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
@@ -6266,25 +6061,8 @@ class Gemma3NModel(Gemma3Model):
]
def set_vocab(self):
    """Build the vocabulary using the full `vocab_size` instead of the per-layer input size.

    For Gemma3n multimodal models, we need the FULL vocab_size (262400)
    which includes special tokens from 262144-262399 for vision/audio.
    The vocab_size_per_layer_input (262144) is only the embedding size per layer.

    `vocab_size_per_layer_input` is temporarily removed from `self.hparams`
    while the parent implementation runs so that it is not used for the
    vocab size, and is always put back afterwards.
    """
    # Remember the original vocab_size_per_layer_input if it exists
    vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")

    # Temporarily remove vocab_size_per_layer_input to force using vocab_size
    if vocab_size_per_layer_input is not None:
        del self.hparams["vocab_size_per_layer_input"]

    try:
        # Call parent set_vocab which will now use vocab_size (262400)
        super().set_vocab()
    finally:
        # Restore vocab_size_per_layer_input for later use, even if the parent
        # raised, so hparams is never left in the modified state
        if vocab_size_per_layer_input is not None:
            self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
@@ -6320,32 +6098,8 @@ class Gemma3NModel(Gemma3Model):
if "language_model." not in name:
return [] # skip non-language model tensors
# Pad token embeddings for vision/audio special tokens (262144-262399)
if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
# Move to CPU to avoid meta device issues during padding
data_torch = data_torch.to(device="cpu")
vocab_size = self.hparams.get("vocab_size", 262400)
current_size = data_torch.shape[0] # First dimension is vocab_size
if current_size < vocab_size:
# Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
padding_size = vocab_size - current_size
tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
# Create padding with zeros (vision tokens won't use these embeddings)
padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
data_torch = torch.cat([data_torch, padding], dim=0)
# Continue with normal processing
name = name.replace("language_model.", "")
return [(self.map_tensor_name(name), data_torch)]
if "altup_unembed_projections" in name:
data_torch = data_torch.to(device="cpu")
# altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
# They should NOT be padded
if ".0." in name:
self._altup_unembd[0] = data_torch
elif ".1." in name:
@@ -7458,7 +7212,6 @@ class DeepseekModel(TextModel):
"DeepseekV3ForCausalLM",
"KimiVLForConditionalGeneration",
"YoutuForCausalLM",
"YoutuVLForConditionalGeneration"
)
class DeepseekV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -8751,102 +8504,6 @@ class Exaone4Model(TextModel):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
@ModelBase.register("ExaoneMoEForCausalLM")
class ExaoneMoEModel(Exaone4Model):
    """Converter for checkpoints registered as ``ExaoneMoEForCausalLM``.

    Extends the Exaone4 conversion with:
      * MoE hyper-parameters written in ``set_gguf_parameters``,
      * remapping of ``mtp.*`` tensors onto extra blocks appended after the
        regular ``num_hidden_layers`` blocks,
      * merging of per-expert ``down_proj``/``gate_proj``/``up_proj`` weights
        into one stacked tensor per block.
    """
    model_arch = gguf.MODEL_ARCH.EXAONE_MOE

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The NextN/MTP layers are counted as additional blocks after the regular ones.
        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

    def set_gguf_parameters(self):
        """Write the parent's parameters, then the MoE / NextN specific ones."""
        super().set_gguf_parameters()
        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
        moe_intermediate_size = self.hparams["moe_intermediate_size"]
        num_shared_experts = self.hparams["num_shared_experts"]
        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
        self.gguf_writer.add_expert_shared_count(num_shared_experts)
        self.gguf_writer.add_expert_shared_feed_forward_length(moe_intermediate_size * num_shared_experts)
        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
        # Either key may carry the dense-layer count; fall back to 0 when neither is present.
        n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
        self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))

        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)

    # Per-block staging area for expert weights until a block's full set has been seen.
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to zero or more output tensors.

        Returns an empty list while expert weights of a block are still being
        collected, several entries when a shared ``mtp.*`` tensor is replicated
        or a block's experts are merged, and a single entry otherwise.
        """
        if name.startswith("mtp."):
            if name.find("layers.") != -1:
                # `mtp.layers.0.[module_name]` format
                name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + self.hparams['num_hidden_layers']}")
            else:
                # mtp fc/norm weights
                remapper = {
                    "mtp.fc":                    "model.layers.{bid}.eh_proj",
                    "mtp.pre_fc_norm_embedding": "model.layers.{bid}.enorm",
                    "mtp.pre_fc_norm_hidden":    "model.layers.{bid}.hnorm",
                    "mtp.norm":                  "model.layers.{bid}.shared_head.norm",
                }
                _n = Path(name)
                name_template = remapper[_n.stem] + _n.suffix

                # set shared weights for all NextN/MTP layers
                # The template must stay untouched across iterations: formatting it in
                # place would drop the `{bid}` placeholder after the first layer and give
                # every following layer the same tensor name.
                tensors = []
                for mtp_bid in range(self.hparams['num_hidden_layers'], self.block_count):
                    layer_name = name_template.format(bid=mtp_bid)
                    tensors.append((self.map_tensor_name(layer_name), data_torch))
                return tensors

        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        if name.find("mlp.experts") != -1:
            n_experts = self.hparams["num_experts"]
            assert bid is not None

            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

            self._experts[bid][name] = data_torch

            # Three projections per expert: merge once all of them have arrived.
            if len(self._experts[bid]) >= n_experts * 3:
                tensors: list[tuple[str, Tensor]] = []

                # merge the experts into a single 3d tensor
                for w_name in ["down_proj", "gate_proj", "up_proj"]:
                    datas: list[Tensor] = []

                    for xid in range(n_experts):
                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
                        datas.append(self._experts[bid][ename])
                        del self._experts[bid][ename]

                    data_torch = torch.stack(datas, dim=0)

                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"

                    new_name = self.map_tensor_name(merged_name)
                    tensors.append((new_name, data_torch))
                return tensors
            else:
                return []

        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
        """Run the parent's preparation, then fail if any expert weight was never merged."""
        super().prepare_tensors()

        if self._experts is not None:
            # flatten `list[dict[str, Tensor]]` into `list[str]`
            experts = [k for d in self._experts for k in d.keys()]
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("GraniteForCausalLM")
class GraniteModel(LlamaModel):
"""Conversion for IBM's GraniteForCausalLM"""
@@ -10278,7 +9935,7 @@ class LFM2Model(TextModel):
self._add_feed_forward_length()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
if self._is_vision_tensor(name) or self._is_audio_tensor(name):
# skip multimodal tensors
return []
@@ -10294,26 +9951,8 @@ class LFM2Model(TextModel):
def _is_vision_tensor(self, name: str) -> bool:
return "vision_tower" in name or "multi_modal_projector" in name
@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
    """Converter registered under ``Lfm2Model``.

    Reuses the LFM2 tensor handling and additionally emits one dense weight
    read from ``1_Dense/model.safetensors`` inside the model directory.
    """
    model_arch = gguf.MODEL_ARCH.LFM2
    dense_tensor_name = "dense_2"

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Prefix every non-dense tensor name with ``model.`` and defer to the parent."""
        is_dense = name.startswith(self.dense_tensor_name)
        forwarded_name = name if is_dense else "model." + name
        return super().modify_tensors(data_torch, forwarded_name, bid)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Yield the dense weight and record its first dimension as the output embedding length."""
        # dense tensor is stored in a separate safetensors file
        from safetensors.torch import load_file

        dense_path = self.dir_model / "1_Dense" / "model.safetensors"
        assert dense_path.is_file()

        state_dict = load_file(dense_path)
        dense_weight = state_dict["linear.weight"]
        self.gguf_writer.add_embedding_length_out(dense_weight.shape[0])

        yield f"{self.dense_tensor_name}.weight", dense_weight.clone()
def _is_audio_tensor(self, name: str):
return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
@ModelBase.register("Lfm2MoeForCausalLM")
@@ -10421,11 +10060,13 @@ class LFM2VLModel(MmprojModel):
@ModelBase.register("Lfm2AudioForConditionalGeneration")
class LFM2AudioModel(ConformerAudioModel):
class LFM2AudioModel(MmprojModel):
has_vision_encoder = False
has_audio_encoder = True
model_name = "Lfm2AudioEncoder"
_batch_norm_tensors: list[dict[str, Tensor]] | None = None
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("encoder")
@@ -10439,7 +10080,12 @@ class LFM2AudioModel(ConformerAudioModel):
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
def modify_tensors(self, data_torch, name, bid):
def tensor_force_quant(self, name, new_name, bid, n_dims):
    """Force F32 for tensors whose source name contains both ``.conv`` and ``.weight``.

    Every other tensor is handled by the parent implementation.
    """
    is_conv_weight = ".conv" in name and ".weight" in name
    if not is_conv_weight:
        return super().tensor_force_quant(name, new_name, bid, n_dims)
    return gguf.GGMLQuantizationType.F32
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip language model tensors
if name.startswith("lfm."):
return []
@@ -10452,7 +10098,40 @@ class LFM2AudioModel(ConformerAudioModel):
if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
return []
return super().modify_tensors(data_torch, name, bid)
# fold running_mean, running_var and eps into weight and bias for batch_norm
if "batch_norm" in name:
if self._batch_norm_tensors is None:
self._batch_norm_tensors = [{} for _ in range(self.block_count)]
assert bid is not None
self._batch_norm_tensors[bid][name] = data_torch
if len(self._batch_norm_tensors[bid]) < 5:
return []
weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
eps = 1e-5 # default value
a = weight / torch.sqrt(running_var + eps)
b = bias - running_mean * a
return [
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
(self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
]
# reshape conv weights
if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
data_torch = data_torch[:, None, None]
if "conv.depthwise_conv" in name and name.endswith(".weight"):
assert data_torch.shape[1] == 1
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
if "conv.pointwise_conv" in name and name.endswith(".weight"):
assert data_torch.shape[2] == 1
data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("SmallThinkerForCausalLM")
@@ -10995,8 +10674,8 @@ class JanusProVisionModel(MmprojModel):
return []
@ModelBase.register("YoutuVLForConditionalGeneration")
class YoutuVLVisionModel(MmprojModel):
@ModelBase.register("YOUTUVLForConditionalGeneration", "YOUTUVLForCausalLM")
class YOUTUVLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
@@ -11273,8 +10952,8 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--sentence-transformers-dense-modules", action="store_true",
help=("Whether to include sentence-transformers dense modules. "
"It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
help=("Whether to include sentence-transformers dense modules."
"It can be used for sentence-transformers models, like google/embeddinggemma-300m"
"Default these modules are not included.")
)

View File

@@ -147,7 +147,6 @@ models = [
{"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
{"name": "youtu", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
{"name": "solar-open", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
{"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -1,4 +1,4 @@
{
{
"version": 4,
"configurePresets": [
{
@@ -23,7 +23,7 @@
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_OPENSSL": "OFF"
"LLAMA_CURL": "OFF"
}
},
@@ -38,7 +38,7 @@
"GGML_OPENCL": "ON",
"GGML_HEXAGON": "ON",
"GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128",
"LLAMA_OPENSSL": "OFF"
"LLAMA_CURL": "OFF"
}
},

View File

@@ -210,10 +210,6 @@ build: 6a8cf8914 (6733)
Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
- `GGML_HEXAGON_EXPERIMENTAL=1`
Controls whether the Hexagon backend enables experimental features.
This option is required for enabling/testing experimental Ops (FLASH_ATTN_EXT).
- `GGML_HEXAGON_VERBOSE=1`
Enables verbose logging of Ops from the backend. Example output:

View File

@@ -15,7 +15,7 @@ Below is the build script: it requires utilizing RISC-V vector instructions for
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_CPU_RISCV64_SPACEMIT=ON \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_CURL=OFF \
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \

View File

@@ -65,10 +65,10 @@ cmake --build build --config Release
cmake --preset x64-windows-llvm-release
cmake --build build-x64-windows-llvm-release
```
- If you want HTTPS/TLS features, you may install OpenSSL development libraries. If not installed, the project will build and run without SSL support.
- **Debian / Ubuntu:** `sudo apt-get install libssl-dev`
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install openssl-devel`
- **Arch / Manjaro:** `sudo pacman -S openssl`
- Curl usage is enabled by default and can be turned off with `-DLLAMA_CURL=OFF`. Otherwise you need to install development libraries for libcurl.
- **Debian / Ubuntu:** `sudo apt-get install libcurl4-openssl-dev` # (or `libcurl4-gnutls-dev` if you prefer GnuTLS)
- **Fedora / RHEL / Rocky / Alma:** `sudo dnf install libcurl-devel`
- **Arch / Manjaro:** `sudo pacman -S curl` # includes libcurl headers
## BLAS Build

View File

@@ -22,7 +22,7 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
@@ -57,6 +57,7 @@ Legend:
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -70,9 +71,10 @@ Legend:
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -97,6 +99,7 @@ Legend:
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |

View File

@@ -965,7 +965,6 @@
"BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","0","no","BLAS"
"BLAS","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
"BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
"BLAS","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","BLAS"
@@ -4965,7 +4964,6 @@
"BLAS","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","0","no","BLAS"
"BLAS","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","BLAS"
"BLAS","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","BLAS"
"BLAS","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","BLAS"
"BLAS","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","BLAS"
"BLAS","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","BLAS"
"BLAS","ARGMAX","type=f32,ne=[32,1,1,1]","support","0","no","BLAS"
@@ -5717,15 +5715,15 @@
"BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
"BLAS","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","0","no","BLAS"
"BLAS","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","0","no","BLAS"
@@ -5735,15 +5733,6 @@
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","0","no","BLAS"
"BLAS","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
"BLAS","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
"BLAS","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","BLAS"
@@ -6603,30 +6592,6 @@
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","BLAS"
"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q4_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q5_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q5_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q8_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=mxfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q2_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q3_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q4_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q5_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=q6_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq2_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq2_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq3_xxs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq1_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq1_m,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq4_nl,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq3_s,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=iq4_xs,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=f16,type_b=f32,m=128,n=1,k=1056,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=2112,o=1","support","0","no","BLAS"
"BLAS","MUL_MAT","type_a=bf16,type_b=f32,m=1056,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","BLAS"
@@ -8951,11 +8916,6 @@
"BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200000,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[200000,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX","type=f32,ne=[643251,3,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0","support","0","no","BLAS"
"BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
"BLAS","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
"BLAS","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","BLAS"
@@ -9008,7 +8968,6 @@
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9018,7 +8977,6 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9029,13 +8987,11 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9045,7 +9001,6 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9056,13 +9011,11 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9072,7 +9025,6 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9083,13 +9035,11 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9099,7 +9049,6 @@
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9110,7 +9059,6 @@
"BLAS","ROPE","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9236,7 +9184,6 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9246,7 +9193,6 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9257,13 +9203,11 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9273,7 +9217,6 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
@@ -9284,13 +9227,11 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9300,7 +9241,6 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
@@ -9311,13 +9251,11 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,64,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,1,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,71,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,8,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9327,7 +9265,6 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=20,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,2,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,32,4,1],n_dims=32,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=128,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,12,2,1],n_dims=20,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
@@ -9338,7 +9275,6 @@
"BLAS","ROPE_BACK","type=f32,ne_a=[128,28,2,1],n_dims=32,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[80,16,2,1],n_dims=80,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
"BLAS","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","0","no","BLAS"
@@ -9606,333 +9542,333 @@
"BLAS","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","BLAS"
"BLAS","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","BLAS"
"BLAS","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","0","no","BLAS"
"BLAS","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=nearest,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bilinear|antialias,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|antialias","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear|antialias","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic|align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
"BLAS","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bicubic,flags=align_corners","support","0","no","BLAS"
"BLAS","SUM","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
"BLAS","SUM_ROWS","type=f32,ne=[10,5,4,3],permute=0,slice=0","support","0","no","BLAS"
"BLAS","SUM","type=f32,ne=[11,5,6,3],permute=[0,2,1,3]","support","0","no","BLAS"
@@ -9955,9 +9891,8 @@
"BLAS","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
"BLAS","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","0","no","BLAS"
"BLAS","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","BLAS"
"BLAS","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","BLAS"
"BLAS","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","BLAS"
"BLAS","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","BLAS"
@@ -9979,7 +9914,6 @@
"BLAS","CUMSUM","type=f32,ne=[2048,5,4,3]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[242004,1,1,1]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","BLAS"
"BLAS","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","BLAS"
"BLAS","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","BLAS"
"BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","BLAS"
"BLAS","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","BLAS"
@@ -9989,41 +9923,17 @@
"BLAS","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","BLAS"
"BLAS","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","BLAS"
"BLAS","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","BLAS"
"BLAS","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","BLAS"
"BLAS","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","0","no","BLAS"
"BLAS","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","0","no","BLAS"
"BLAS","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","BLAS"
Can't render this file because it is too large.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,97 +0,0 @@
# llama.cpp INI Presets
## Introduction
The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/llama.cpp/pull/17859), allows users to create reusable and shareable parameter configurations for llama.cpp.
### Using Presets with the Server
When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details.
### Using a Remote Preset
> [!NOTE]
>
> This feature is currently only supported via the `-hf` option.
For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model.
Example:
```ini
hf-repo-draft = username/my-draft-model-GGUF
temp = 0.5
top-k = 20
top-p = 0.95
```
For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options.
Example usage:
Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above:
```sh
llama-cli -hf username/my-model-with-preset
# This is equivalent to:
llama-cli -hf username/my-model-with-preset \
--hf-repo-draft username/my-draft-model-GGUF \
--temp 0.5 \
--top-k 20 \
--top-p 0.95
```
You can also override preset arguments by specifying them on the command line:
```sh
# Force temp = 0.1, overriding the preset value
llama-cli -hf username/my-model-with-preset --temp 0.1
```
If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s):
```ini
hf-repo = user/my-model-main
hf-repo-draft = user/my-model-draft
temp = 0.8
ctx-size = 1024
; (and other configurations)
```
### Named presets
If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo containing a single `preset.ini` file that references the actual model(s):
```ini
[*]
mmap = 1
[gpt-oss-20b-hf]
hf = ggml-org/gpt-oss-20b-GGUF
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}
[gpt-oss-120b-hf]
hf = ggml-org/gpt-oss-120b-GGUF
batch-size = 2048
ubatch-size = 2048
top-p = 1.0
top-k = 0
min-p = 0.01
temp = 1.0
chat-template-kwargs = {"reasoning_effort": "high"}
```
You can then use it via `llama-cli` or `llama-server`, example:
```sh
llama-server -hf user/repo:gpt-oss-120b-hf
```
Please make sure to provide the correct `hf-repo` for each child preset. Otherwise, you may get the error: `The specified tag is not a valid quantization scheme.`

View File

@@ -15,7 +15,6 @@ llama_add_compile_flags()
if (EMSCRIPTEN)
else()
add_subdirectory(batched)
add_subdirectory(debug)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
@@ -35,6 +34,7 @@ else()
add_subdirectory(gen-docs)
add_subdirectory(training)
add_subdirectory(diffusion)
add_subdirectory(model-conversion)
if (NOT GGML_BACKEND_DL)
add_subdirectory(convert-llama2c-to-ggml)
# these examples use the backends directly and cannot be built with dynamic loading

View File

@@ -21,7 +21,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}

View File

@@ -1,54 +0,0 @@
# llama.cpp/examples/debug
This is a utility intended to help debug a model by registering a callback that
logs GGML operations and tensor data. It can also store the generated logits or
embeddings as well as the prompt and token ids for comparison with the original
model.
### Usage
```shell
llama-debug \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model phi-2-q4_0.gguf \
--prompt hello \
--save-logits \
--verbose
```
The tensor data is logged at debug level and requires the `--verbose` flag. The reason
for this is that while useful for a model with many layers there can be a lot of
output. You can filter the tensor names using the `--tensor-filter` option.
A recommended approach is to first run without `--verbose` and see if the
generated logits/embeddings are close to the original model. If they are not,
then it might be required to inspect tensor by tensor and in that case it is
useful to enable the `--verbose` flag along with `--tensor-filter` to focus on
specific tensors.
### Options
This example supports all standard `llama.cpp` options and also accepts the
following options:
```console
$ llama-debug --help
...
----- example-specific params -----
--save-logits save final logits to files for verification (default: false)
--logits-output-dir PATH directory for saving logits output files (default: data)
--tensor-filter REGEX filter tensor names for debug output (regex pattern, can be specified multiple times)
```
### Output Files
When `--save-logits` is enabled, the following files are created in the output
directory:
* `llamacpp-<model>[-embeddings].bin` - Binary output (logits or embeddings)
* `llamacpp-<model>[-embeddings].txt` - Text output (logits or embeddings, one per line)
* `llamacpp-<model>[-embeddings]-prompt.txt` - Prompt text and token IDs
* `llamacpp-<model>[-embeddings]-tokens.bin` - Binary token IDs for programmatic comparison
These files can be compared against the original model's output to verify the
converted model.

View File

@@ -1,253 +0,0 @@
#include "debug.h"
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cstdlib>
#include <string>
#include <vector>
#include <filesystem>
#include <fstream>
#include <regex>
// Prints example command lines for this tool through LOG.
// argv[0] is substituted for every "{prog}" placeholder in the template; the
// argument count is unused.
static void print_usage(int /*argc*/, char ** argv) {
const std::string usage_template = R"(
example usage:
Print tensors:
{prog} -m model.gguf -p "Hello my name is" --verbose
The tensors to be printed can be filtered with --tensor-filter option.
Save logits/embeddings:
{prog} -m model.gguf -p "Hello my name is" --save-logits
Add --embedding to save embeddings)" "\n";
// Fix the source code indentation above that is introduced by the raw string literal.
// (removes exactly eight spaces that directly follow a newline)
std::string usage = std::regex_replace(usage_template, std::regex("\\n {8}"), "\n");
// NOTE(review): argv[0] is used as the regex replacement format, so a '$' in the
// program path would be treated as a substitution reference rather than literal text.
usage = std::regex_replace(usage, std::regex("\\{prog\\}"), argv[0]);
LOG("%s\n", usage.c_str());
}
// Tells whether the context uses a pooling type other than NONE/UNSPECIFIED.
static bool has_pooling(llama_context * ctx) {
    const auto pooling = llama_pooling_type(ctx);

    const bool is_unpooled = pooling == LLAMA_POOLING_TYPE_NONE ||
                             pooling == LLAMA_POOLING_TYPE_UNSPECIFIED;
    return !is_unpooled;
}
// Collects the model output for params.prompt (logits, or embeddings when
// params.embedding is set) together with the prompt text and its token ids.
//
// data_ptr is a non-owning view of data_size floats. It points either at memory
// returned by the llama context or at embd_norm (when normalization is applied),
// so it is only valid while the context and this object are alive and unmoved.
struct output_data {
    float *                  data_ptr  = nullptr; // first value to save
    int                      data_size = 0;       // number of floats behind data_ptr
    std::string              type_suffix;         // "-embeddings" or "" - appended to output file names
    std::vector<float>       embd_norm;           // storage for normalized embeddings (may stay empty)
    std::string              prompt;
    std::vector<llama_token> tokens;

    // Throws std::runtime_error when the context cannot provide the requested output.
    output_data(llama_context * ctx, const llama_model * model, const common_params & params) {
        const llama_vocab * vocab   = llama_model_get_vocab(model);
        const bool          add_bos = llama_vocab_get_add_bos(vocab);

        tokens = common_tokenize(ctx, params.prompt, add_bos);
        prompt = params.prompt;

        if (params.embedding) {
            const int  n_embd       = llama_model_n_embd_out(model);
            const bool pooling      = has_pooling(ctx);
            // with pooling there is a single embedding for the sequence, otherwise one per token
            const int  n_embd_count = pooling ? 1 : tokens.size();
            const int  n_floats     = n_embd * n_embd_count;

            float * embd_raw = pooling ? llama_get_embeddings_seq(ctx, 0) : llama_get_embeddings(ctx);
            if (embd_raw == nullptr) {
                throw std::runtime_error("failed to get embeddings from the model");
            }

            LOG_DBG("pooling_enabled: %s\n", pooling ? "true" : "false");
            LOG_DBG("n_embd: %d\n", n_embd);
            LOG_DBG("n_floats: %d\n", n_floats);
            LOG_DBG("n_embd_count: %d\n", n_embd_count);

            data_ptr    = embd_raw;
            data_size   = n_floats;
            type_suffix = "-embeddings";

            if (params.embd_normalize >= 0) {
                // normalize each embedding separately into our own buffer
                embd_norm.resize(n_floats);
                for (int i = 0; i < n_embd_count; i++) {
                    common_embd_normalize(embd_raw+i*n_embd, embd_norm.data()+i*n_embd, n_embd, params.embd_normalize);
                }
                data_ptr = embd_norm.data();
            }
        } else {
            const float * logits = llama_get_logits_ith(ctx, tokens.size() - 1);
            // Mirror the embeddings branch: fail here instead of handing a null
            // pointer to the code that later reads data_size floats from it.
            if (logits == nullptr) {
                throw std::runtime_error("failed to get logits from the model");
            }
            const int n_logits = llama_vocab_n_tokens(vocab);

            data_ptr    = const_cast<float*>(logits);
            data_size   = n_logits;
            type_suffix = "";
        }
    }
};
// Writes the collected output to four files under output_dir, all sharing the
// base name "llamacpp-<model_name><type_suffix>":
//   .bin         raw floats (logits or embeddings)
//   .txt         one "index: value" line per float
//   -prompt.txt  prompt text, token count and comma-separated token ids
//   -tokens.bin  raw token ids
// Throws std::runtime_error if any of the files cannot be opened.
static void save_output_data(const output_data & output, const std::string & model_name, const std::string & output_dir) {
    // create_directories also creates missing parents, so nested output paths work
    std::filesystem::create_directories(output_dir);
    auto base_path = std::filesystem::path{output_dir} / ("llamacpp-" + model_name + output.type_suffix);

    // Save logits/embeddings to binary file.
    {
        std::filesystem::path filepath{base_path.string() + ".bin"};
        std::ofstream file{filepath, std::ios::binary};
        if (!file) {
            throw std::runtime_error("failed to open binary output file: " + filepath.string());
        }
        file.write(reinterpret_cast<const char*>(output.data_ptr), output.data_size * sizeof(float));
        // path::c_str() yields wchar_t* on Windows, which does not match "%s";
        // go through string() so the argument is always a narrow C string
        LOG("Data saved to %s\n", filepath.string().c_str());
    }

    // Save logits/embeddings to text file.
    {
        std::filesystem::path filepath{base_path.string() + ".txt"};
        std::ofstream file{filepath};
        if (!file) {
            throw std::runtime_error("failed to open text output file: " + filepath.string());
        }
        for (int i = 0; i < output.data_size; i++) {
            file << i << ": " << output.data_ptr[i] << '\n';
        }
        LOG("Data saved to %s\n", filepath.string().c_str());
    }

    // Save prompt and tokens to text file.
    {
        std::filesystem::path filepath{base_path.string() + "-prompt.txt"};
        std::ofstream file{filepath};
        if (!file) {
            throw std::runtime_error("failed to open prompt output file: " + filepath.string());
        }
        file << "prompt: " << output.prompt << '\n';
        file << "n_tokens: " << output.tokens.size() << '\n';
        file << "token ids: ";
        for (size_t i = 0; i < output.tokens.size(); i++) {
            file << output.tokens[i];
            if (i + 1 < output.tokens.size()) {
                file << ", ";
            }
        }
        file << '\n';
        LOG("Prompt saved to %s\n", filepath.string().c_str());
    }

    // Save token ids to binary file.
    {
        std::filesystem::path filepath{base_path.string() + "-tokens.bin"};
        std::ofstream file{filepath, std::ios::binary};
        if (!file) {
            throw std::runtime_error("failed to open tokens binary file: " + filepath.string());
        }
        file.write(reinterpret_cast<const char*>(output.tokens.data()), output.tokens.size() * sizeof(llama_token));
        LOG("Tokens saved to %s\n", filepath.string().c_str());
    }
}
// Logs the prompt, whether the vocab adds a BOS token, and every token as
// "<piece>(<id>) " on a single line.
static void print_tokenized_prompt(llama_context * ctx, const std::vector<llama_token> & tokens, const std::string & prompt) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    LOG("Model add_bos: %s\n", llama_vocab_get_add_bos(vocab) ? "true" : "false");
    LOG("Input prompt: \"%s\"\n", prompt.c_str());
    LOG("Token ids (%zu):\n", tokens.size());

    for (auto id : tokens) {
        std::string piece(128, '\0');
        int n = llama_token_to_piece(vocab, id, piece.data(), piece.size(), 0, true);
        if (n < 0) {
            // The buffer was too small: the negated result is the size that is
            // needed, so grow the buffer and try once more instead of dropping
            // the token from the output.
            piece.resize(-n);
            n = llama_token_to_piece(vocab, id, piece.data(), piece.size(), 0, true);
        }
        if (n < 0) {
            LOG_ERR("failed to convert token %d to piece\n", id);
            continue;
        }
        piece.resize(n);
        LOG("%s(%d) ", piece.c_str(), id);
    }
    LOG("\n");
}
// Tokenizes params.prompt, decodes it as a single batch, prints the tokens and,
// when params.save_logits is set, writes the output files.
// Returns false if the prompt yields no tokens or decoding fails.
static bool run(llama_context * ctx, const common_params & params) {
    const llama_model * model   = llama_get_model(ctx);
    const bool          add_bos = llama_vocab_get_add_bos(llama_model_get_vocab(model));

    std::vector<llama_token> prompt_tokens = common_tokenize(ctx, params.prompt, add_bos);
    if (prompt_tokens.empty()) {
        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
        return false;
    }

    const auto decode_status = llama_decode(ctx, llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()));
    if (decode_status != 0) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return false;
    }

    print_tokenized_prompt(ctx, prompt_tokens, params.prompt);

    if (!params.save_logits) {
        return true;
    }

    // output files are named after the model file without its extension
    output_data output {ctx, model, params};
    const std::string model_name = std::filesystem::path{params.model.path}.stem().string();
    save_output_data(output, model_name, params.logits_output_dir);

    return true;
}
// Entry point: parses the command line, initializes the backend, loads the
// model/context, processes the prompt via run() and prints performance data.
// Returns 0 on success and 1 on argument, initialization or run failure.
int main(int argc, char ** argv) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
return 1;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);
// Constructed before the model/context are created from params.
// NOTE(review): presumably this registers the tensor callback on params - confirm in debug.h
base_callback_data cb_data(params, params.tensor_filter);
auto llama_init = common_init_from_params(params);
auto * model = llama_init->model();
auto * ctx = llama_init->context();
if (model == nullptr || ctx == nullptr) {
LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
}
if (!run(ctx, params)) {
return 1;
}
LOG("\n");
llama_perf_context_print(ctx);
// NOTE(review): the early "return 1" paths above skip llama_backend_free()
llama_backend_free();
return 0;
}

View File

@@ -553,7 +553,6 @@ int main(int argc, char ** argv) {
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.use_mmap = params.use_mmap;
model_params.use_direct_io = params.use_direct_io;
model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;

View File

@@ -33,7 +33,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd_out, int embd_norm) {
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
@@ -65,8 +65,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
}
float * out = output + embd_pos * n_embd_out;
common_embd_normalize(embd, out, n_embd_out, embd_norm);
float * out = output + embd_pos * n_embd;
common_embd_normalize(embd, out, n_embd, embd_norm);
}
}
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
}
// allocate output
const int n_embd_out = llama_model_n_embd_out(model);
std::vector<float> embeddings(n_embd_count * n_embd_out, 0);
const int n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data();
// break into batches
@@ -267,8 +267,8 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
float * out = emb + e * n_embd_out;
batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
common_batch_clear(batch);
@@ -280,8 +280,8 @@ int main(int argc, char ** argv) {
}
// final batch
float * out = emb + e * n_embd_out;
batch_decode(ctx, batch, out, s, n_embd_out, params.embd_normalize);
float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
LOG("\n");
@@ -289,19 +289,19 @@ int main(int argc, char ** argv) {
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
LOG("embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd_out); i++) {
for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) {
LOG("%6.0f ", emb[j * n_embd_out + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
LOG("%9.6f ", emb[j * n_embd_out + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
LOG(" ... ");
for (int i = n_embd_out - 3; i < n_embd_out; i++) {
for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) {
LOG("%6.0f ", emb[j * n_embd_out + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
LOG("%9.6f ", emb[j * n_embd_out + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
LOG("\n");
@@ -320,9 +320,9 @@ int main(int argc, char ** argv) {
for (uint32_t i = 0; i < n_cls_out; i++) {
// NOTE: if you change this log - update the tests in ci/run.sh
if (n_cls_out == 1) {
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd_out]);
LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
} else {
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd_out + i], cls_out_labels[i].c_str());
LOG("rerank score %d: %8.3f [%s]\n", j, emb[j * n_embd + i], cls_out_labels[i].c_str());
}
}
}
@@ -330,11 +330,11 @@ int main(int argc, char ** argv) {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
LOG("embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd_out) : n_embd_out); i++) {
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
LOG("%6.0f ", emb[j * n_embd_out + i]);
LOG("%6.0f ", emb[j * n_embd + i]);
} else {
LOG("%9.6f ", emb[j * n_embd_out + i]);
LOG("%9.6f ", emb[j * n_embd + i]);
}
}
LOG("\n");
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
LOG("%6.2f ", sim);
}
LOG("%1.10s", prompts[i].c_str());
@@ -368,9 +368,9 @@ int main(int argc, char ** argv) {
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd_out + i]);
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
if (i < n_embd_out) LOG(","); else break;
if (i < n_embd) LOG(","); else break;
}
LOG(notArray ? "]\n }" : "]");
j++;
@@ -383,7 +383,7 @@ int main(int argc, char ** argv) {
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = common_embd_similarity_cos(emb + i * n_embd_out, emb + j * n_embd_out, n_embd_out);
float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
LOG("%6.2f", sim);
j++;
if (j < n_embd_count) LOG(", "); else break;
@@ -397,7 +397,7 @@ int main(int argc, char ** argv) {
if (notArray) LOG("\n}\n");
} else if (params.embd_out == "raw") {
print_raw_embeddings(emb, n_embd_count, n_embd_out, model, pooling_type, params.embd_normalize);
print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
}
LOG("\n");

View File

@@ -4,23 +4,12 @@ install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(LLAMA_BUILD_TESTS)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
set(MODEL_NAME "tinyllamas/stories15M-q4_0.gguf")
set(MODEL_HASH "SHA256=66967fbece6dbe97886593fdbb73589584927e29119ec31f08090732d1861739")
else()
set(MODEL_NAME "tinyllamas/stories15M-be.Q4_0.gguf")
set(MODEL_HASH "SHA256=9aec857937849d976f30397e97eb1cabb53eb9dcb1ce4611ba8247fb5f44c65d")
endif()
set(MODEL_DEST "${CMAKE_BINARY_DIR}/${MODEL_NAME}")
set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET}-download-model COMMAND ${CMAKE_COMMAND}
-DDEST=${MODEL_DEST}
-DNAME=${MODEL_NAME}
-DHASH=${MODEL_HASH}
-P ${CMAKE_SOURCE_DIR}/cmake/download-models.cmake
)
set_tests_properties(${TEST_TARGET}-download-model PROPERTIES FIXTURES_SETUP ${TEST_TARGET}-download-model)
add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback -m "${MODEL_DEST}" --prompt hello --seed 42 -ngl 0)
set_tests_properties(${TEST_TARGET} PROPERTIES FIXTURES_REQUIRED ${TEST_TARGET}-download-model)
set(TEST_TARGET test-eval-callback)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
else()
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
endif()
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

View File

@@ -1,12 +1,165 @@
#include "arg.h"
#include "common.h"
#include "debug.h"
#include "log.h"
#include "llama.h"
#include "llama-cpp.h"
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
/**
* This is the arbitrary user data which will be passed to each callback invocation.
* Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
*/
struct callback_data {
// scratch byte buffer owned by the callback state; reused across callback invocations
std::vector<uint8_t> data;
};
// Renders the tensor's ne[] extents as a ", "-separated list with one entry
// per dimension (GGML_MAX_DIMS entries in total).
static std::string ggml_ne_string(const ggml_tensor * t) {
    std::string dims;
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        // the separator goes between entries, never after the last one
        if (d > 0) {
            dims += ", ";
        }
        dims += std::to_string(t->ne[d]);
    }
    return dims;
}
// Converts a bf16 value to fp32: the 16 stored bits are shifted into the upper
// half of a 32-bit pattern (lower half zero) which is then read back as a float.
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        float    as_float;
        uint32_t as_bits;
    } pun;

    pun.as_bits = static_cast<uint32_t>(h.bits) << 16;

    return pun.as_float;
}
// Loads the element at index (i0, i1, i2, i3) from a raw tensor buffer and
// returns it as a float.
//
// data : start of the tensor's bytes
// type : element type; F16, F32, BF16, I64, I32, I16 and I8 are handled,
//        any other type aborts
// nb   : per-dimension strides, used here as byte offsets into `data`
static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
    const size_t    offset = i0 * nb[0] + i1 * nb[1] + i2 * nb[2] + i3 * nb[3];
    const uint8_t * elem   = &data[offset];

    float value;
    switch (type) {
        case GGML_TYPE_F16:
            value = ggml_fp16_to_fp32(*(const ggml_fp16_t *) elem);
            break;
        case GGML_TYPE_F32:
            value = *(const float *) elem;
            break;
        case GGML_TYPE_I64:
            value = (float) *(const int64_t *) elem;
            break;
        case GGML_TYPE_I32:
            value = (float) *(const int32_t *) elem;
            break;
        case GGML_TYPE_I16:
            value = (float) *(const int16_t *) elem;
            break;
        case GGML_TYPE_I8:
            value = (float) *(const int8_t *) elem;
            break;
        case GGML_TYPE_BF16:
            value = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) elem);
            break;
        default:
            GGML_ABORT("fatal error");
    }

    return value;
}
// Logs an abbreviated view of a tensor together with the sum of its elements.
//
// data : raw tensor bytes, read through ggml_get_float_value
// type : element type of the tensor
// ne   : extents of the 4 dimensions (ne[0] is the innermost loop)
// nb   : per-dimension strides forwarded to ggml_get_float_value
// n    : number of leading and trailing entries shown per dimension; must be > 0
//
// Terminates the process if the accumulated sum is NaN.
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
GGML_ASSERT(n > 0);
// first pass: accumulate the sum over every element of the tensor
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
sum += v;
}
}
}
}
// second pass: print the values; dimension 3 is never abbreviated
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
// after the first n entries, jump to the last n when the extent exceeds 2*n
if (i2 == n && ne[2] > 2*n) {
LOG(" ..., \n");
i2 = ne[2] - n;
}
LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
// same elision for dimension 1
if (i1 == n && ne[1] > 2*n) {
LOG(" ..., \n");
i1 = ne[1] - n;
}
LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
// same elision for dimension 0
if (i0 == n && ne[0] > 2*n) {
LOG("..., ");
i0 = ne[0] - n;
}
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
LOG("%12.4f", v);
// no separator after the last value of a row
if (i0 < ne[0] - 1) LOG(", ");
}
LOG("],\n");
}
LOG(" ],\n");
}
LOG(" ]\n");
// the sum covers the whole tensor but is printed once per i3 slice
LOG(" sum = %f\n", sum);
}
// TODO: make this abort configurable/optional?
// NOTE(review): the process exits with status 0 here even though an error is logged - confirm this is intended
if (std::isnan(sum)) {
LOG_ERR("encountered NaN - aborting\n");
exit(0);
}
}
/**
 * GGML operations callback during the graph execution.
 *
 * Logs the tensor name, type, operation, source operands and shape, then prints
 * the tensor values (non-quantized types only). Tensors whose buffer is not
 * host-accessible are first copied into the scratch buffer of the callback data.
 *
 * @param t         current tensor
 * @param ask       when ask is true, the scheduler wants to know if we are interested in data from this tensor
 *                  if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *                  see ggml_backend_sched_eval_callback
 * @param user_data user data to pass at each call back; must point to a callback_data
 * @return true to receive data or continue the graph, false otherwise
 */
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    if (ask) {
        return true; // Always retrieve data
    }

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    // Either source may be absent (e.g. ops that take no inputs), so both are
    // null-checked before use; a missing source is logged as an empty string.
    const char *      src0_name = src0 ? src0->name : "";
    const std::string src0_ne   = src0 ? ggml_ne_string(src0) : std::string();

    char src1_str[128] = {0};
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
        t->name, ggml_type_name(t->type), ggml_op_desc(t),
        src0_name, src0_ne.c_str(),
        src1 ? src1_str : "",
        ggml_ne_string(t).c_str());

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    // quantized tensors cannot be read element-wise by ggml_print_tensor, so they are only logged above
    if (!ggml_is_quantized(t->type)) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}
static bool run(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -29,7 +182,7 @@ static bool run(llama_context * ctx, const common_params & params) {
}
int main(int argc, char ** argv) {
base_callback_data cb_data;
callback_data cb_data;
common_params params;
@@ -44,7 +197,7 @@ int main(int argc, char ** argv) {
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
params.cb_eval = common_debug_cb_eval<false>;
params.cb_eval = ggml_debug;
params.cb_eval_user_data = &cb_data;
params.warmup = false;

View File

@@ -26,7 +26,7 @@ android {
arguments += "-DBUILD_SHARED_LIBS=ON"
arguments += "-DLLAMA_BUILD_COMMON=ON"
arguments += "-DLLAMA_OPENSSL=OFF"
arguments += "-DLLAMA_CURL=OFF"
arguments += "-DGGML_NATIVE=OFF"
arguments += "-DGGML_BACKEND_DL=ON"

View File

@@ -1,5 +1,5 @@
set(TARGET llama-debug)
add_executable(${TARGET} debug.cpp)
set(TARGET llama-logits)
add_executable(${TARGET} logits.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -61,7 +61,7 @@ causal-run-converted-model:
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
causal-verify-logits: causal-run-original-model causal-run-converted-model
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/compare-logits.py
@./scripts/causal/compare-logits.py
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
causal-run-original-embeddings:
@@ -138,13 +138,16 @@ embedding-run-original-model-st: embedding-run-original-model
embedding-run-converted-model:
@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
$(if $(EMBD_NORMALIZE),--embd-normalize "$(EMBD_NORMALIZE)")
$(if $(USE_POOLING),--pooling)
embedding-run-converted-model-st: USE_POOLING=1
embedding-run-converted-model-st: embedding-run-converted-model
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
@./scripts/embedding/compare-embeddings-logits.sh \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model
embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
@./scripts/embedding/compare-embeddings-logits.sh \
$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")

View File

@@ -198,13 +198,14 @@ model, and the other is a text file which allows for manual visual inspection.
#### Using SentenceTransformer with numbered layers
For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
03_Dense, 04_Normalize), these will be applied automatically when running the
converted model but currently there is a separate target to run the original
version:
03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
```console
# Run original model with SentenceTransformer (applies all numbered layers)
(venv) $ make embedding-run-original-model-st
# Run converted model with pooling enabled
(venv) $ make embedding-run-converted-model-st
```
This will use the SentenceTransformer library to load and run the model, which
@@ -212,17 +213,6 @@ automatically applies all the numbered layers in the correct order. This is
particularly useful when comparing with models that should include these
additional transformation layers beyond just the base model output.
The type of normalization can be specified for the converted model but is not
strictly necessary as the verification uses cosine similarity and the magnitude
of the output vectors does not affect this. But the normalization type can be
specified as an argument to the target which might be useful for manual
inspection:
```console
(venv) $ make embedding-verify-logits-st EMBD_NORMALIZE=1
```
The original model will apply the normalization according to the normalization
layer specified in the modules.json configuration file.
### Model conversion
After updates have been made to [gguf-py](../../gguf-py) to add support for the
new model the model can be converted to GGUF format using the following command:

View File

@@ -0,0 +1,268 @@
#include "llama.h"
#include "common.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <ctype.h>
#include <filesystem>
// Writes a short usage example to stdout. Only argv[0] (the program name) is
// used; the argument count is ignored.
static void print_usage(int, char ** argv) {
    const char * program = argv[0];

    fputs("\nexample usage:\n", stdout);
    printf("\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [-pooling] [-embd-norm <norm>] [prompt]\n", program);
    fputs("\n", stdout);
    fputs(" -embd-norm: normalization type for pooled embeddings (default: 2)\n", stdout);
    fputs(" -1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm\n", stdout);
    fputs("\n", stdout);
}
int main(int argc, char ** argv) {
std::string model_path;
std::string prompt = "Hello, my name is";
int ngl = 0;
bool embedding_mode = false;
bool pooling_enabled = false;
int32_t embd_norm = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
{
int i = 1;
for (; i < argc; i++) {
if (strcmp(argv[i], "-m") == 0) {
if (i + 1 < argc) {
model_path = argv[++i];
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-ngl") == 0) {
if (i + 1 < argc) {
try {
ngl = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-embd-mode") == 0) {
embedding_mode = true;
} else if (strcmp(argv[i], "-pooling") == 0) {
pooling_enabled = true;
} else if (strcmp(argv[i], "-embd-norm") == 0) {
if (i + 1 < argc) {
try {
embd_norm = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else {
// prompt starts here
break;
}
}
if (model_path.empty()) {
print_usage(argc, argv);
return 1;
}
if (i < argc) {
prompt = argv[i++];
for (; i < argc; i++) {
prompt += " ";
prompt += argv[i];
}
}
}
ggml_backend_load_all();
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
// Extract basename from model_path
const char * basename = strrchr(model_path.c_str(), '/');
basename = (basename == NULL) ? model_path.c_str() : basename + 1;
char model_name[256];
strncpy(model_name, basename, 255);
model_name[255] = '\0';
char * dot = strrchr(model_name, '.');
if (dot != NULL && strcmp(dot, ".gguf") == 0) {
*dot = '\0';
}
printf("Model name: %s\n", model_name);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_prompt;
ctx_params.n_batch = n_prompt;
ctx_params.no_perf = false;
if (embedding_mode) {
ctx_params.embeddings = true;
ctx_params.pooling_type = pooling_enabled ? LLAMA_POOLING_TYPE_MEAN : LLAMA_POOLING_TYPE_NONE;
ctx_params.n_ubatch = ctx_params.n_batch;
}
llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
printf("Input prompt: \"%s\"\n", prompt.c_str());
printf("Tokenized prompt (%d tokens): ", n_prompt);
for (auto id : prompt_tokens) {
char buf[128];
int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s (%d)", s.c_str(), id);
}
printf("\n");
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
float * data_ptr;
int data_size;
const char * type;
std::vector<float> embd_out;
if (embedding_mode) {
const int n_embd = llama_model_n_embd(model);
const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
const int n_embeddings = n_embd * n_embd_count;
float * embeddings;
type = "-embeddings";
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
embeddings = llama_get_embeddings_seq(ctx, 0);
embd_out.resize(n_embeddings);
printf("Normalizing embeddings using norm: %d\n", embd_norm);
common_embd_normalize(embeddings, embd_out.data(), n_embeddings, embd_norm);
embeddings = embd_out.data();
} else {
embeddings = llama_get_embeddings(ctx);
}
printf("Embedding dimension: %d\n", n_embd);
printf("\n");
// Print embeddings in the specified format
for (int j = 0; j < n_embd_count; j++) {
printf("embedding %d: ", j);
// Print first 3 values
for (int i = 0; i < 3 && i < n_embd; i++) {
printf("%9.6f ", embeddings[j * n_embd + i]);
}
printf(" ... ");
// Print last 3 values
for (int i = n_embd - 3; i < n_embd; i++) {
if (i >= 0) {
printf("%9.6f ", embeddings[j * n_embd + i]);
}
}
printf("\n");
}
printf("\n");
printf("Embeddings size: %d\n", n_embeddings);
data_ptr = embeddings;
data_size = n_embeddings;
} else {
float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
const int n_logits = llama_vocab_n_tokens(vocab);
type = "";
printf("Vocab size: %d\n", n_logits);
data_ptr = logits;
data_size = n_logits;
}
std::filesystem::create_directory("data");
// Save data to binary file
char bin_filename[512];
snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
printf("Saving data to %s\n", bin_filename);
FILE * f = fopen(bin_filename, "wb");
if (f == NULL) {
fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
return 1;
}
fwrite(data_ptr, sizeof(float), data_size, f);
fclose(f);
// Also save as text for debugging
char txt_filename[512];
snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type);
f = fopen(txt_filename, "w");
if (f == NULL) {
fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
return 1;
}
for (int i = 0; i < data_size; i++) {
fprintf(f, "%d: %.6f\n", i, data_ptr[i]);
}
fclose(f);
if (!embedding_mode) {
printf("First 10 logits: ");
for (int i = 0; i < 10 && i < data_size; i++) {
printf("%.6f ", data_ptr[i]);
}
printf("\n");
printf("Last 10 logits: ");
for (int i = data_size - 10; i < data_size; i++) {
if (i >= 0) printf("%.6f ", data_ptr[i]);
}
printf("\n\n");
}
printf("Data saved to %s\n", bin_filename);
printf("Data saved to %s\n", txt_filename);
llama_free(ctx);
llama_model_free(model);
return 0;
}

View File

@@ -3,11 +3,10 @@
import sys
import numpy as np
from pathlib import Path
import os
# Add utils directory to path for direct script execution
sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found]
from common import get_model_name_from_env_path # type: ignore[import-not-found]
def quick_logits_check(pytorch_file, llamacpp_file):
"""Lightweight sanity check before NMSE"""
@@ -39,7 +38,6 @@ def quick_logits_check(pytorch_file, llamacpp_file):
return True
def main():
model_path = os.environ.get('MODEL_PATH')
model_name = get_model_name_from_env_path('MODEL_PATH')
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@@ -60,12 +58,6 @@ def main():
print("Checked all required files were found. Proceeding...\n")
# Verify tokens as they are a prerequisite for logits comparison.
print("🔍 Token Comparison Check")
print("=" * 40)
if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"):
exit_with_warning("\n❌ Token mismatch detected", model_path)
print()
print("🔍 GGML Model Validation for model ", model_name)
print("=" * 40)
@@ -81,7 +73,8 @@ def main():
print(" Ok to proceed with NMSE check...")
sys.exit(0)
else:
exit_with_warning(f"❌ NOK: Top 10 predictions don't match - generation will differ", model_path)
print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -7,7 +7,7 @@ base_model:
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF
llama-server -hf {namespace}/{model_name}-GGUF -c 0
```
Then, access http://localhost:8080

View File

@@ -67,7 +67,7 @@ with torch.no_grad():
last_hidden_states = outputs.hidden_states[-1]
# Get embeddings for all tokens
token_embeddings = last_hidden_states[0].float().cpu().numpy() # Remove batch dimension
token_embeddings = last_hidden_states[0].cpu().numpy() # Remove batch dimension
print(f"Hidden states shape: {last_hidden_states.shape}")
print(f"Token embeddings shape: {token_embeddings.shape}")

View File

@@ -13,6 +13,6 @@ if [ -z "$CONVERTED_MODEL" ]; then
exit 1
fi
cmake --build ../../build --target llama-debug -j8
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"

View File

@@ -21,6 +21,6 @@ fi
echo $CONVERTED_MODEL
echo $MODEL_TESTING_PROMPT
cmake --build ../../build --target llama-debug -j8
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"

View File

@@ -7,11 +7,12 @@ import importlib
import torch
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig
# Add parent directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.common import debug_hook, save_output_data
from utils.common import debug_hook
def parse_arguments():
parser = argparse.ArgumentParser(description="Process model with specified path")
@@ -125,7 +126,6 @@ def main():
device = next(model.parameters()).device
prompt = get_prompt(args)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
token_ids = input_ids[0].cpu().tolist()
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
@@ -151,6 +151,19 @@ def main():
print(f"Last token logits shape: {last_logits.shape}")
print(f"Vocab size: {len(last_logits)}")
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}.bin"
txt_filename = data_dir / f"pytorch-{model_name}.txt"
# Save to file for comparison
last_logits.astype(np.float32).tofile(bin_filename)
# Also save as text file for easy inspection
with open(txt_filename, "w") as f:
for i, logit in enumerate(last_logits):
f.write(f"{i}: {logit:.6f}\n")
# Print some sample logits for quick verification
print(f"First 10 logits: {last_logits[:10]}")
print(f"Last 10 logits: {last_logits[-10:]}")
@@ -162,7 +175,8 @@ def main():
token = tokenizer.decode([idx])
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
save_output_data(last_logits, token_ids, prompt, model_name)
print(f"Saved bin logits to: {bin_filename}")
print(f"Saved txt logist to: {txt_filename}")
if __name__ == "__main__":
main()

View File

@@ -5,7 +5,7 @@ set -e
# Parse command line arguments
CONVERTED_MODEL=""
PROMPTS_FILE=""
EMBD_NORMALIZE="2"
USE_POOLING=""
while [[ $# -gt 0 ]]; do
case $1 in
@@ -13,9 +13,9 @@ while [[ $# -gt 0 ]]; do
PROMPTS_FILE="$2"
shift 2
;;
--embd-normalize)
EMBD_NORMALIZE="$2"
shift 2
--pooling)
USE_POOLING="1"
shift
;;
*)
if [ -z "$CONVERTED_MODEL" ]; then
@@ -50,5 +50,10 @@ fi
echo $CONVERTED_MODEL
cmake --build ../../build --target llama-debug -j8
../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
cmake --build ../../build --target llama-logits -j8
# TODO: update logits.cpp to accept a --file/-f option for the prompt
if [ -n "$USE_POOLING" ]; then
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
else
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
fi

View File

@@ -3,15 +3,13 @@
import argparse
import os
import sys
import numpy as np
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModel
import torch
# Add parent directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from utils.common import save_output_data
def parse_arguments():
parser = argparse.ArgumentParser(description='Run original embedding model')
@@ -171,7 +169,6 @@ def main():
return_tensors="pt"
)
tokens = encoded['input_ids'][0]
token_ids = tokens.cpu().tolist()
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
@@ -188,7 +185,6 @@ def main():
)
tokens = encoded['input_ids'][0]
token_ids = tokens.cpu().tolist()
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
@@ -232,11 +228,24 @@ def main():
print()
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
flattened_embeddings = all_embeddings.flatten()
flattened_embeddings.astype(np.float32).tofile(bin_filename)
with open(txt_filename, "w") as f:
idx = 0
for j in range(n_embd_count):
for value in all_embeddings[j]:
f.write(f"{idx}: {value:.6f}\n")
idx += 1
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
print("")
save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings")
print(f"Saved bin embeddings to: {bin_filename}")
print(f"Saved txt embeddings to: {txt_filename}")
if __name__ == "__main__":

View File

@@ -3,11 +3,6 @@
import os
import sys
import torch
import transformers
import json
import textwrap
import numpy as np
from pathlib import Path
def get_model_name_from_env_path(env_path_name):
@@ -153,147 +148,3 @@ def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_
# Patch it
setattr(module, function_name, debug_rope)
print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
    """
    Save output data (logits/embeddings), tokens, and prompt to files.

    Args:
        data: numpy array, torch tensor or sequence of floats (logits or embeddings)
        tokens: list, array or torch tensor of token IDs
        prompt: string containing the input prompt
        model_name: name of the model
        type_suffix: optional suffix like "-embeddings" (default: "")
        output_dir: directory to save files (default: "data"); created,
            including missing parent directories, if it does not exist

    Creates the following files in output_dir:
        - pytorch-{model_name}{type_suffix}.bin         (float32 values)
        - pytorch-{model_name}{type_suffix}.txt         (one "index: value" line per value)
        - pytorch-{model_name}{type_suffix}-prompt.txt  (prompt, token count, token ids)
        - pytorch-{model_name}{type_suffix}-tokens.bin  (int32 token ids)
    """
    data_dir = Path(output_dir)
    # parents=True so that a nested output_dir such as "out/run1" works too
    data_dir.mkdir(parents=True, exist_ok=True)
    base_path = data_dir / f"pytorch-{model_name}{type_suffix}"

    # Convert and flatten logits/embeddings
    if isinstance(data, torch.Tensor):
        # detach(): Tensor.numpy() refuses tensors that track gradients
        # float():  numpy has no bfloat16, so half-precision tensors must be widened first
        data = data.detach().float().cpu().numpy()
    else:
        data = np.asarray(data)
    # reshape(-1) also turns a 0-d value into a 1-element array so it can be enumerated
    data = data.reshape(-1)

    # Save logits/embedding files
    data.astype(np.float32).tofile(f"{base_path}.bin")
    print(f"Data saved to {base_path}.bin")

    with open(f"{base_path}.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data))
    print(f"Data saved to {base_path}.txt")

    # Convert and flatten tokens
    if isinstance(tokens, torch.Tensor):
        tokens = tokens.detach().cpu().numpy()
    else:
        tokens = np.asarray(tokens)
    tokens = tokens.reshape(-1)

    # Save token binary file
    tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin")
    print(f"Tokens saved to {base_path}-tokens.bin")

    # Save prompt file; explicit UTF-8 so non-ASCII prompts do not depend on the locale
    with open(f"{base_path}-prompt.txt", "w", encoding="utf-8") as f:
        f.write(f"prompt: {prompt}\n")
        f.write(f"n_tokens: {len(tokens)}\n")
        f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n")
    print(f"Prompt saved to {base_path}-prompt.txt")
def compare_tokens(original, converted, type_suffix="", output_dir="data"):
    """
    Compare the int32 token files written for two models.

    Reads {output_dir}/{original}{type_suffix}-tokens.bin and the matching file
    for `converted`, prints a summary, and lists up to 10 mismatching positions.

    Returns:
        True when both files exist and hold identical token sequences,
        False otherwise (missing file, different length, or differing tokens).
    """
    data_dir = Path(output_dir)

    # Resolve both token files, original first, and stop at the first missing one
    token_files = [data_dir / f"{name}{type_suffix}-tokens.bin" for name in (original, converted)]
    for token_file in token_files:
        if not token_file.exists():
            print(f"Error: Token file not found: {token_file}")
            return False

    ref_tokens, conv_tokens = (np.fromfile(path, dtype=np.int32) for path in token_files)

    print(f"\nComparing tokens between:")
    print(f" Original : {original} ({len(ref_tokens)} tokens)")
    print(f" Converted: {converted} ({len(conv_tokens)} tokens)")

    # Element-wise comparison is only meaningful for equally long sequences
    if len(ref_tokens) != len(conv_tokens):
        print(f"\n❌ Token count mismatch: {len(ref_tokens)} vs {len(conv_tokens)}")
        return False

    if np.array_equal(ref_tokens, conv_tokens):
        print(f"\n✅ All {len(ref_tokens)} tokens match!")
        return True

    diff_positions = np.flatnonzero(ref_tokens != conv_tokens)
    print(f"\n❌ Found {len(diff_positions)} mismatched tokens:")

    shown = min(len(diff_positions), 10)
    for pos in diff_positions[:shown]:
        print(f" Position {pos}: {ref_tokens[pos]} vs {conv_tokens[pos]}")

    hidden = len(diff_positions) - shown
    if hidden > 0:
        print(f" ... and {hidden} more mismatches")

    return False
def show_version_warning(current_version, model_version):
    """
    Decide whether a version-mismatch warning should be shown.

    Returns False when `model_version` is empty. Otherwise returns True when
    `current_version` is older than `model_version`; if the `packaging` module
    is unavailable or a version string cannot be parsed, falls back to a plain
    inequality check of the two values.
    """
    if not model_version:
        return False

    try:
        from packaging.version import parse, InvalidVersion
    except ImportError:
        # no version parser available: any difference counts as a mismatch
        return current_version != model_version

    try:
        is_older = parse(current_version) < parse(model_version)
    except InvalidVersion:
        # unparsable version string: same fallback as above
        return current_version != model_version
    return is_older
def get_model_transformers_version(model_path):
    """
    Read the "transformers_version" entry from {model_path}/config.json.

    Returns:
        The stored value, or None when `model_path` is empty, config.json is
        not a file, the key is absent, or the file cannot be read or parsed
        (a warning is printed to stderr in that last case).
    """
    config_path = Path(model_path) / "config.json" if model_path else None
    if config_path is None or not config_path.is_file():
        return None

    try:
        config = json.loads(config_path.read_text(encoding="utf-8"))
    except (IOError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr)
        return None

    return config.get("transformers_version")
def exit_with_warning(message, model_path):
    """
    Print `message`, optionally followed by a transformers version-mismatch
    hint, then terminate the process with exit status 1.

    The hint is printed only when `model_path` is set and
    show_version_warning() reports a mismatch between the installed
    transformers version and the one recorded in the model's config.json.
    """
    print(message)

    can_check_version = bool(model_path) and transformers is not None
    if can_check_version:
        model_transformers_version = get_model_transformers_version(model_path)
        transformers_version = transformers.__version__

        if show_version_warning(transformers_version, model_transformers_version):
            warning_message = f"""
=====================================================================
Verification failure might be due to a transformers version mismatch:
Current transformers version: {transformers_version}
Model's required version : {model_transformers_version}
Consider installing the version specified by the model's config:
pip install transformers=={model_transformers_version}
=====================================================================
"""
            print(textwrap.dedent(warning_message))

    sys.exit(1)

View File

@@ -1,76 +0,0 @@
#!/usr/bin/env python3
import argparse
import sys
from common import compare_tokens # type: ignore
def parse_arguments():
    """
    Parse the command line of the token comparison script.

    Returns an argparse.Namespace with: original, converted (positional model
    names), suffix (default ""), data_dir (default "data") and verbose (bool).
    """
    parser = argparse.ArgumentParser(
        description='Compare tokens between two models',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
"""
    )

    # Positional arguments: the two model names whose token files are compared
    parser.add_argument('original', help='Original model name')
    parser.add_argument('converted', help='Converted model name')

    # Optional arguments, declared as (flags, keyword options) pairs
    optional_specs = (
        (('-s', '--suffix'), dict(default='', help='Type suffix (e.g., "-embeddings")')),
        (('-d', '--data-dir'), dict(default='data', help='Directory containing token files (default: data)')),
        (('-v', '--verbose'), dict(action='store_true', help='Print prompts from both models')),
    )
    for flags, options in optional_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
def main():
    """
    Script entry point: optionally print the saved prompts of both models,
    compare their token files, and exit with status 0 on a match and 1 otherwise.
    """
    args = parse_arguments()

    if args.verbose:
        from pathlib import Path

        def show_prompt(header, prompt_file):
            # A prompt file that was never written is silently skipped
            if prompt_file.exists():
                print(header)
                print(f" {prompt_file.read_text().strip()}")

        data_dir = Path(args.data_dir)
        show_prompt(f"\nOriginal model prompt ({args.original}):",
                    data_dir / f"{args.original}{args.suffix}-prompt.txt")
        show_prompt(f"\nConverted model prompt ({args.converted}):",
                    data_dir / f"{args.converted}{args.suffix}-prompt.txt")
        print()

    tokens_match = compare_tokens(
        args.original,
        args.converted,
        type_suffix=args.suffix,
        output_dir=args.data_dir
    )

    # Enable the script to be used in shell scripts so that they can check
    # the exit code for success/failure.
    exit_code = 0 if tokens_match else 1
    sys.exit(exit_code)
if __name__ == "__main__":
main()

View File

@@ -4,10 +4,8 @@ import numpy as np
import argparse
import os
import importlib
from pathlib import Path
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
@@ -159,24 +157,9 @@ def main():
else:
prompt = args.prompt
python_emb_path = Path(args.python_embeddings)
cpp_emb_path = Path(args.cpp_embeddings)
# Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
python_model_name = python_emb_path.stem.replace("-embeddings", "")
cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
print("=" * 70)
# First verify tokens match before comparing embeddings
print("\n🔍 Token Comparison Check")
print("=" * 70)
data_dir = python_emb_path.parent
if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
exit_with_warning("\n❌ Token mismatch detected", args.model_path)
print()
# Single prompt detailed comparison
print(f"\nTesting with prompt: '{prompt}'")
@@ -236,7 +219,7 @@ def main():
elif avg_cross_sim > 0.70:
print("⚠️ FAIR: Models have some differences")
else:
exit_with_warning("❌ POOR: Models are significantly different", args.model_path)
print("❌ POOR: Models are significantly different")
if __name__ == "__main__":
main()

View File

@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output
const int n_embd_out = llama_model_n_embd_out(model);
std::vector<float> embeddings(n_chunks * n_embd_out, 0);
const int n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(n_chunks * n_embd, 0);
float * emb = embeddings.data();
// break into batches
@@ -232,8 +232,8 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch || s >= llama_n_seq_max(ctx)) {
float * out = emb + p * n_embd_out;
batch_process(ctx, batch, out, s, n_embd_out);
float * out = emb + p * n_embd;
batch_process(ctx, batch, out, s, n_embd);
common_batch_clear(batch);
p += s;
s = 0;
@@ -245,12 +245,12 @@ int main(int argc, char ** argv) {
}
// final batch
float * out = emb + p * n_embd_out;
batch_process(ctx, batch, out, s, n_embd_out);
float * out = emb + p * n_embd;
batch_process(ctx, batch, out, s, n_embd);
// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
chunks[i].embedding = std::vector<float>(emb + i * n_embd_out, emb + (i + 1) * n_embd_out);
chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
// clear tokens as they are no longer needed
chunks[i].tokens.clear();
}
@@ -266,8 +266,8 @@ int main(int argc, char ** argv) {
batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd_out, 0);
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd_out);
std::vector<float> query_emb(n_embd, 0);
batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
common_batch_clear(query_batch);
@@ -275,7 +275,7 @@ int main(int argc, char ** argv) {
{
std::vector<std::pair<int, float>> similarities;
for (int i = 0; i < n_chunks; i++) {
float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd_out);
float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
similarities.push_back(std::make_pair(i, sim));
}

View File

@@ -8,10 +8,10 @@ cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
#for FP32
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
#build example/main
#cmake --build . --config Release --target main

View File

@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
:: for FP32
cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
if %errorlevel% neq 0 goto ERROR
:: build all binary

View File

@@ -7,7 +7,7 @@ extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 6
#define RPC_PROTO_MINOR_VERSION 7
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16

View File

@@ -234,11 +234,6 @@
#if UINTPTR_MAX == 0xFFFFFFFF
#define GGML_MEM_ALIGN 4
#elif defined(__EMSCRIPTEN__)
// emscripten uses max_align_t == 8, so we need GGML_MEM_ALIGN == 8 for 64-bit wasm.
// (for 32-bit wasm, the first conditional is true and GGML_MEM_ALIGN stays 4.)
// ref: https://github.com/ggml-org/llama.cpp/pull/18628
#define GGML_MEM_ALIGN 8
#else
#define GGML_MEM_ALIGN 16
#endif

View File

@@ -144,7 +144,7 @@ extern "C" {
// device description: short informative description of the device, could be the model name
const char * (*get_description)(ggml_backend_dev_t dev);
// device memory in bytes: 0 bytes to indicate no memory to report
// device memory in bytes
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
// device type

View File

@@ -32,12 +32,14 @@ if (BLAS_FOUND)
pkg_check_modules(DepBLAS openblas)
endif()
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
add_compile_definitions(GGML_BLAS_USE_BLIS)
pkg_check_modules(DepBLAS blis)
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS blas-atlas)
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS flexiblas_api)
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
add_compile_definitions(GGML_BLAS_USE_MKL)
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS mkl-sdl)
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
@@ -72,26 +74,10 @@ if (BLAS_FOUND)
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if ("${GGML_BLAS_VENDOR}" STREQUAL "")
message(WARNING "GGML_BLAS_VENDOR is not set; some methods may not link properly.")
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "Intel" OR ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND "${GGML_BLAS_VENDOR}" MATCHES "Generic"))
if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
add_compile_definitions(GGML_BLAS_USE_OPENBLAS)
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "FLAME" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL" OR "${GGML_BLAS_VENDOR}" MATCHES "AOCL_mt")
add_compile_definitions(GGML_BLAS_USE_BLIS)
endif()
if ("${GGML_BLAS_VENDOR}" MATCHES "NVPL")
add_compile_definitions(GGML_BLAS_USE_NVPL)
endif()
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()

View File

@@ -115,11 +115,15 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
#endif
}
#if defined(GGML_BLAS_USE_OPENBLAS)
#if defined(OPENBLAS_VERSION)
openblas_set_num_threads(ctx->n_threads);
#elif defined(GGML_BLAS_USE_BLIS)
#endif
#if defined(GGML_BLAS_USE_BLIS)
bli_thread_set_num_threads(ctx->n_threads);
#elif defined(GGML_BLAS_USE_NVPL)
#endif
#if defined(GGML_BLAS_USE_NVPL)
nvpl_blas_set_num_threads(ctx->n_threads);
#endif
@@ -284,7 +288,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
/* .context = */ ctx,
};
#if defined(GGML_BLAS_USE_OPENBLAS) && defined(GGML_USE_OPENMP)
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
@@ -325,7 +329,7 @@ static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t
return "BLIS";
#elif defined(GGML_BLAS_USE_NVPL)
return "NVPL";
#elif defined(GGML_BLAS_USE_OPENBLAS)
#elif defined(OPENBLAS_VERSION)
return "OpenBLAS";
#else
return "BLAS";

View File

@@ -1963,7 +1963,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor *
acl_tensor_ptr acl_weight_tensor;
// Only check env once.
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (weight_to_nz && is_matmul_weight(weight)) {
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
} else {

View File

@@ -103,7 +103,7 @@ const ggml_cann_device_info & ggml_cann_info();
void ggml_cann_set_device(int32_t device);
int32_t ggml_cann_get_device();
std::optional<std::string> get_env_as_lowercase(const std::string & name);
std::optional<std::string> get_env(const std::string & name);
bool parse_bool(const std::string & value);
int parse_integer(const std::string & value);

View File

@@ -105,10 +105,10 @@ int32_t ggml_cann_get_device() {
}
/**
* @brief Get the value of the specified environment variable (name) as lowercase.
* @brief Get the value of the specified environment variable (name).
* if not empty, return a std::string object
*/
std::optional<std::string> get_env_as_lowercase(const std::string & name) {
std::optional<std::string> get_env(const std::string & name) {
const char * val = std::getenv(name.c_str());
if (!val) {
return std::nullopt;
@@ -122,7 +122,7 @@ std::optional<std::string> get_env_as_lowercase(const std::string & name) {
* @brief Verify whether the environment variable is a valid value.
*/
bool parse_bool(const std::string & value) {
static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
return valid_values.find(value) != valid_values.end();
}
@@ -259,7 +259,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
* @param device The device ID to associate with this buffer pool.
*/
explicit ggml_cann_pool_buf_prio(int device) : device(device) {
disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
}
/**
@@ -452,7 +452,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
* @param device The device ID to associate with this buffer pool.
*/
explicit ggml_cann_pool_buf(int device) : device(device) {
disable_clean = parse_bool(get_env_as_lowercase("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
}
/**
@@ -764,7 +764,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
* @return A unique pointer to the created CANN pool.
*/
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
std::string mem_pool_type = get_env_as_lowercase("GGML_CANN_MEM_POOL").value_or("");
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
if (mem_pool_type == "prio") {
GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
@@ -1217,7 +1217,7 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
// Why aclrtSynchronizeDevice?
// Only check env once.
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
@@ -1442,7 +1442,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_t
int64_t ne0 = tensor->ne[0];
// Only check env once.
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
// last line must be bigger than 32, because every single op deals with at
// least 32 bytes.
@@ -2136,7 +2136,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
#endif // USE_ACL_GRAPH
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
// With the use of CANN graphs, the execution will be performed by the graph launch.
static bool opt_fusion = parse_bool(get_env_as_lowercase("GGML_CANN_OPERATOR_FUSION").value_or(""));
static bool opt_fusion = parse_bool(get_env("GGML_CANN_OPERATOR_FUSION").value_or(""));
if (!use_cann_graph || cann_graph_capture_required) {
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -2201,7 +2201,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
#ifdef USE_ACL_GRAPH
bool use_cann_graph = true;
static bool prefill_use_graph = parse_bool(get_env_as_lowercase("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
if (!prefill_use_graph) {
// Do not use acl_graph for prefill.
for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -2541,6 +2541,27 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
* @brief Determines if a tensor operation should be offloaded to the CANN
* backend.
*
* This function checks if a given tensor operation should be offloaded to the
* CANN backend based on the operation type and the size of the tensor. It
* returns true if the second dimension (ne[1]) of the tensor is greater than or
* equal to the minimum batch size (a fixed value of 32 in this function) and
* the operation is not GGML_OP_GET_ROWS.
*
* @param dev The CANN backend device; not used by this function.
* @param op Pointer to the tensor operation to check.
* @return bool Returns true if the operation should be offloaded, otherwise
* false.
*/
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
// Hard-coded threshold compared against op->ne[1].
const int min_batch_size = 32;
// dev is part of the signature only; silence the unused-parameter warning.
GGML_UNUSED(dev);
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
}
/**
* @brief Records an event on the CANN backend stream.
*
@@ -2616,7 +2637,6 @@ struct ggml_backend_cann_device_context {
int device;
std::string name;
std::string description;
int op_offload_min_batch_size;
};
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
@@ -2693,26 +2713,6 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
return ggml_backend_cann_host_buffer_type();
}
/**
* @brief Determines if a tensor operation should be offloaded to the CANN
* backend.
*
* This function checks if a given tensor operation should be offloaded to the
* CANN backend based on the operation type and the size of the tensor. It
* returns true if the second dimension (ne[1]) of the tensor is greater than or
* equal to the minimum batch size stored in the device context
* (op_offload_min_batch_size) and the operation is not GGML_OP_GET_ROWS.
*
* @param dev The CANN backend device; its context is read as a
* ggml_backend_cann_device_context.
* @param op Pointer to the tensor operation to check.
* @return bool Returns true if the operation should be offloaded, otherwise
* false.
*/
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
// The per-device threshold lives in the device context attached to dev.
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
return op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS;
}
/**
* @brief Creates a new event for the CANN backend device.
*
@@ -2829,14 +2829,12 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
if (!initialized) {
aclInit(nullptr);
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
for (int i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
dev_ctx->description = aclrtGetSocName();
dev_ctx->device = i;
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
dev_ctx->op_offload_min_batch_size = min_batch_size;
ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg,

View File

@@ -47,10 +47,7 @@ if (CUDAToolkit_FOUND)
# check Modules/Internal/CMakeCUDAArchitecturesValidate.cmake in the CMake git repository instead.
# However, the architectures 120a-real and 121a-real should work with basically any CMake version and
# until the release of e.g. Rubin there is no benefit to shipping virtual architectures for Blackwell.
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real)
endif()
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.9")
list(APPEND CMAKE_CUDA_ARCHITECTURES 121a-real)
list(APPEND CMAKE_CUDA_ARCHITECTURES 120a-real 121a-real)
endif()
endif()
endif()

View File

@@ -262,10 +262,6 @@ static const char * cu_get_error_str(CUresult err) {
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
#if defined(TURING_MMA_AVAILABLE)
#define LDMATRIX_TRANS_AVAILABLE
#endif // defined(TURING_MMA_AVAILABLE)
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
@@ -530,86 +526,6 @@ static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
#endif // FP16_AVAILABLE
}
// Selects which reduction block_reduce performs.
enum class block_reduce_method {
MAX,
SUM,
};
// Per-method, per-type reduction policy. Only declared here; the usable
// definitions are the partial specializations for each block_reduce_method,
// which provide the static members reduce() and sentinel().
template<block_reduce_method method_t, typename T>
struct block_reduce_policy;
// True when T is exactly the same type as at least one of Ts
// (fold over std::is_same_v; false for an empty Ts pack).
template <typename T, typename... Ts>
inline constexpr bool is_any = (std::is_same_v<T, Ts> || ...);
// Always false, but dependent on its template arguments. Intended for use as
// the condition of a static_assert inside a template so that the assertion is
// only evaluated when that code is actually instantiated.
template<typename...>
inline constexpr bool ggml_cuda_dependent_false_v = false;
// Policy for block_reduce_method::SUM.
// Supported element types: float, float2, half2 and int. Any other T fails to
// compile with "Unsupported type for block reduce sum".
template <typename T> struct block_reduce_policy<block_reduce_method::SUM, T> {
// Reduces val by forwarding to warp_reduce_sum (defined elsewhere).
static __device__ T reduce(T val) {
if constexpr(is_any<T, float, float2, half2, int>) {
return warp_reduce_sum(val);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
}
}
// Neutral element for the sum: zero in every component of T.
static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
return 0.0f;
} else if constexpr (std::is_same_v<T, float2>) {
return make_float2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, half2>) {
return make_half2(0.0f, 0.0f);
} else if constexpr (std::is_same_v<T, int>) {
return 0;
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce sum");
}
}
};
// Policy for block_reduce_method::MAX.
// Supported element types: float and half2. Any other T fails to compile with
// "Unsupported type for block reduce max".
template <typename T> struct block_reduce_policy<block_reduce_method::MAX, T> {
// Reduces val by forwarding to warp_reduce_max (defined elsewhere).
static __device__ T reduce(T val) {
if constexpr (is_any<T, float, half2>) {
return warp_reduce_max(val);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
}
}
// Neutral element for the maximum: -INFINITY in every component of T.
static __device__ T sentinel() {
if constexpr (std::is_same_v<T, float>) {
return -INFINITY;
} else if constexpr (std::is_same_v<T, half2>) {
return make_half2(-INFINITY, -INFINITY);
} else {
static_assert(ggml_cuda_dependent_false_v<T>, "Unsupported type for block reduce max");
}
}
};
// Reduces val across the thread block using the policy selected by
// reduce_method_t (block_reduce_policy<reduce_method_t, T>).
//
// Template parameters:
//   reduce_method_t     - which reduction to perform.
//   block_size_template - block size known at compile time; 0 means "read
//                         blockDim.x at run time".
// Parameters:
//   val         - this thread's contribution.
//   shared_vals - scratch buffer visible to every thread of the block
//                 (presumably __shared__ memory - confirm at call sites); it is
//                 indexed by threadIdx.x / WARP_SIZE.
// Returns the result of the final policy reduce() call.
//
// Notes:
//   - Only threadIdx.x is used to derive warp and lane indices.
//   - When the block is larger than one warp this function calls
//     __syncthreads(), so every thread of the block has to reach it.
//   - Blocks larger than one warp must have at most 1024 threads and a size
//     that is a multiple of WARP_SIZE (asserted).
template <block_reduce_method reduce_method_t, const unsigned int block_size_template = 0, typename T>
static __device__ T block_reduce(T val, T * shared_vals) {
    using policy = block_reduce_policy<reduce_method_t, T>;

    // First pass: per-warp reduction through the policy.
    val = policy::reduce(val);

    const unsigned int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
    if (!(block_size > WARP_SIZE)) {
        // A block of at most one warp needs no second pass.
        return val;
    }

    assert((block_size <= 1024) && (block_size % WARP_SIZE) == 0);

    const int n_warps = static_cast<int>(block_size) / WARP_SIZE;
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    // Lane 0 of each warp publishes that warp's partial result.
    if (lane_id == 0) {
        shared_vals[warp_id] = val;
    }
    __syncthreads();

    // Lanes that map to an existing warp pick up its partial result; the
    // remaining lanes contribute the neutral element.
    val = lane_id < n_warps ? shared_vals[lane_id] : policy::sentinel();

    // Second pass: combine the per-warp partial results.
    return policy::reduce(val);
}
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE
@@ -1120,7 +1036,7 @@ struct ggml_tensor_extra_gpu {
#define USE_CUDA_GRAPH
#endif
struct ggml_cuda_graph_node_properties {
struct ggml_graph_node_properties {
void * node_address;
ggml_op node_op;
int64_t ne[GGML_MAX_DIMS];
@@ -1145,25 +1061,10 @@ struct ggml_cuda_graph {
std::vector<cudaGraphNode_t> nodes;
bool disable_due_to_gpu_arch = false;
bool disable_due_to_too_many_updates = false;
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
std::vector<ggml_cuda_graph_node_properties> props;
// Tracks how many times in a row a graph update was required while graphs were
// in use, and permanently sets disable_due_to_too_many_updates once that streak
// reaches 4.
//   use_graph       - whether the graph path is being used for this call.
//   update_required - whether the graph had to be updated for this call.
void record_update(bool use_graph, bool update_required) {
    // The streak only grows when both flags are set; anything else resets it.
    const bool extends_streak = use_graph && update_required;
    number_consecutive_updates = extends_streak ? number_consecutive_updates + 1 : 0;

    if (number_consecutive_updates < 4) {
        return;
    }

    GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
    disable_due_to_too_many_updates = true;
}
// Reports whether graphs may be used: false if the GPU architecture flag is
// set, if the GGML_CUDA_DISABLE_GRAPHS environment variable exists, or if too
// many consecutive updates were recorded; true otherwise.
bool is_enabled() const {
    // The environment is inspected once; only the variable's presence matters.
    static const bool env_disables_graphs = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

    if (disable_due_to_gpu_arch || env_disables_graphs) {
        return false;
    }
    return !disable_due_to_too_many_updates;
}
bool cuda_graphs_enabled = false;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
#endif
};

View File

@@ -11,12 +11,10 @@
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
// log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
// by the VKQ accumulators is effectively being shifted up by a factor of 2.
// by the VKQ accumulators is effectively being shifted up by a factor of 8.
// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
// Still, the value range should be shifted as much as necessary but as little as possible.
// The macro on the following line shifts it by a factor of 2**3=8, as was needed to fix https://github.com/ggml-org/llama.cpp/issues/18606 .
#define FATTN_KQ_MAX_OFFSET (3.0f*0.6931f)
#define FATTN_KQ_MAX_OFFSET 0.6931f
typedef void (* fattn_kernel_t)(
const char * __restrict__ Q,
@@ -914,7 +912,7 @@ void launch_fattn(
const int nblocks_stream_k = max_blocks;
const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || amd_wmma_available(cc) || tiles_efficiency_percent < 75;
const bool use_stream_k = cc >= GGML_CUDA_CC_ADA_LOVELACE || tiles_efficiency_percent < 75;
blocks_num.x = use_stream_k ? nblocks_stream_k : ntiles_total;
blocks_num.y = 1;

View File

@@ -98,19 +98,6 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
}
// Returns the flash-attention MMA kernel configuration used for RDNA for the
// given head sizes (DKQ, DV) and number of columns (ncols).
// NOTE(review): GGML_CUDA_FATTN_MMA_CONFIG_CASE is defined outside this view;
// presumably each invocation returns a fattn_mma_config when (DKQ, DV, ncols)
// match its first three arguments - confirm against the macro definition.
// Any combination not handled by the cases below uses the Ampere configuration.
static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_rdna(const int DKQ, const int DV, const int ncols) {
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 128, 2, 64, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 64, 4, 32, 96, 64, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 128, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 128, 1, false);
// TODO tune specifically for RDNA
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
}
static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, const int DV, const int ncols, const int cc) {
if (ampere_mma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);
@@ -118,9 +105,6 @@ static __host__ fattn_mma_config ggml_cuda_fattn_mma_get_config(const int DKQ, c
if (turing_mma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
}
if (amd_wmma_available(cc)) {
return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
}
GGML_ASSERT(volta_mma_available(cc));
return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
}
@@ -132,8 +116,6 @@ static constexpr __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config(cons
return ggml_cuda_fattn_mma_get_config_turing(DKQ, DV, ncols);
#elif defined(VOLTA_MMA_AVAILABLE)
return ggml_cuda_fattn_mma_get_config_volta(DKQ, DV, ncols);
#elif defined(AMD_WMMA_AVAILABLE)
return ggml_cuda_fattn_mma_get_config_rdna(DKQ, DV, ncols);
#else
GGML_UNUSED_VARS(DKQ, DV, ncols);
return fattn_mma_config(32, 1, 0, 0, 0, 0, 0, false);
@@ -204,23 +186,6 @@ static constexpr __device__ bool ggml_cuda_fattn_mma_get_Q_in_reg(const int DKQ,
return ggml_cuda_fattn_mma_get_config(DKQ, DV, ncols).Q_in_reg;
}
// Compile-time number of KQ columns handled per thread: 1 when
// AMD_WMMA_AVAILABLE is defined, 2 otherwise.
static constexpr __device__ int get_cols_per_thread() {
#if defined(AMD_WMMA_AVAILABLE)
return 1; // RDNA has a single column.
#else
return 2; // This is specifically KQ columns, Volta only has a single VKQ column.
#endif // defined(AMD_WMMA_AVAILABLE)
}
// Host-side lookup of the number of columns per warp for compute capability cc:
// 16 when Turing MMA or AMD WMMA is available, 32 otherwise (Volta).
static __host__ int get_cols_per_warp(const int cc) {
    const bool turing_or_amd_wmma = turing_mma_available(cc) || amd_wmma_available(cc);
    // Everything else is treated as Volta.
    return turing_or_amd_wmma ? 16 : 32;
}
// ------------------------------------------------------------------------------------------------------------------
static __host__ int ggml_cuda_fattn_mma_get_nstages(const int DKQ, const int DV, const int ncols1, const int ncols2, const int cc) {
@@ -428,10 +393,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int jt,
const int kb0,
const int k_VKQ_sup) {
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
constexpr int ncols = ncols1 * ncols2;
constexpr int cols_per_warp = T_B_KQ::I;
constexpr int cols_per_thread = get_cols_per_thread();
constexpr int cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
constexpr int np = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
constexpr int nbatch_fa = ggml_cuda_fattn_mma_get_nbatch_fa(DKQ, DV, ncols);
constexpr int nbatch_K2 = ggml_cuda_fattn_mma_get_nbatch_K2(DKQ, DV, ncols);
@@ -448,8 +413,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int k_VKQ_0 = kb0 * nbatch_fa;
#if defined(TURING_MMA_AVAILABLE)
T_C_KQ KQ_C[nbatch_fa/(np*(cols_per_warp == 8 ? T_C_KQ::I : T_C_KQ::J))];
#elif defined(AMD_WMMA_AVAILABLE)
T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
#else // Volta
T_C_KQ KQ_C[nbatch_fa/(np*T_C_KQ::J)];
#endif // defined(TURING_MMA_AVAILABLE)
@@ -498,14 +461,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
if constexpr (cols_per_warp == 8) {
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
} else {
// Wide version of KQ_C is column-major
#if defined(AMD_WMMA_AVAILABLE)
// RDNA matrix C is column-major.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[k_KQ_0/T_A_KQ::J]);
#else
// swap A and B for CUDA.
// Wide version of KQ_C is column-major => swap A and B.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[k_KQ_0/T_A_KQ::J], K_A);
#endif // defined(AMD_WMMA_AVAILABLE)
}
}
}
@@ -522,14 +479,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
T_A_KQ K_A;
load_ldmatrix(K_A, tile_K + i_KQ_0*stride_tile_K + (k_KQ_0 - k0_start), stride_tile_K);
// Wide version of KQ_C is column-major
#if defined(AMD_WMMA_AVAILABLE)
// RDNA matrix C is column-major.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], K_A, Q_B[0]);
#else
// swap A and B for CUDA.
// Wide version of KQ_C is column-major => swap A and B.
mma(KQ_C[i_KQ_00/(np*T_A_KQ::I)], Q_B[0], K_A);
#endif // defined(AMD_WMMA_AVAILABLE)
}
}
}
@@ -581,13 +532,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = l % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
}
}
}
@@ -607,14 +552,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = l % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[KQ_idx]);
KQ_rowsum_add[KQ_idx] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
KQ_C[k0/(np*T_C_KQ::I)].x[l] = expf(KQ_C[k0/(np*T_C_KQ::I)].x[l] - KQ_max_new[l % 2]);
KQ_rowsum_add[l % 2] += KQ_C[k0/(np*T_C_KQ::I)].x[l];
} else {
KQ_C[k0/(np*T_C_KQ::I)].x[l] = 0.0f;
}
@@ -645,13 +584,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = (l/2) % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
KQ_max_new[KQ_idx] = fmaxf(KQ_max_new[KQ_idx], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
}
}
}
@@ -662,11 +596,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
// Values per KQ column are spread across 4 threads:
constexpr int offset_first = 2;
constexpr int offset_last = 1;
#elif defined(AMD_WMMA_AVAILABLE)
// Values per KQ column are spread across 2 threads:
constexpr int offset_first = 16;
constexpr int offset_last = 16;
#else // Volta
#else
// Values per KQ column are spread across 2 threads:
constexpr int offset_first = 2;
constexpr int offset_last = 2;
@@ -682,15 +612,10 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) {
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
// Turing + Volta:
if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) {
#if defined(AMD_WMMA_AVAILABLE)
constexpr int KQ_idx = 0;
#else
// Turing + Volta:
const int KQ_idx = (l/2) % 2;
#endif // defined(AMD_WMMA_AVAILABLE)
KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[KQ_idx]);
KQ_rowsum_add[KQ_idx] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
KQ_C[(k0/(np*T_C_KQ::J))].x[l] = expf(KQ_C[(k0/(np*T_C_KQ::J))].x[l] - KQ_max_new[(l/2) % 2]);
KQ_rowsum_add[(l/2) % 2] += KQ_C[(k0/(np*T_C_KQ::J))].x[l];
} else {
KQ_C[(k0/(np*T_C_KQ::J))].x[l] = 0.0f;
}
@@ -714,7 +639,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#if defined(TURING_MMA_AVAILABLE)
if constexpr (cols_per_warp == 8) {
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[cols_per_thread - 1]);
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
#pragma unroll
for (int i = 0; i < DV/T_C_VKQ::I; ++i) {
#pragma unroll
@@ -735,16 +660,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)
const half2 KQ_max_scale_h2 = make_half2(
KQ_max_scale[0], KQ_max_scale[0]);
#pragma unroll
for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
#pragma unroll
for (int l = 0; l < T_C_VKQ::ne; ++l) {
VKQ_C[i].x[l] *= KQ_max_scale_h2;
}
}
#else // Volta
const half2 KQ_max_scale_h2 = make_half2(
KQ_max_scale[(threadIdx.x / 2) % 2], KQ_max_scale[(threadIdx.x / 2) % 2]);
@@ -792,10 +707,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
// Therefore, iterate over V in reverse and re-use the data if possible.
static_assert(!mla || nstages <= 1, "combination of MLA and multi-stage loading not implemented");
constexpr int reusable_cutoff = mla ? (DKQ - 1) - (DKQ - 1) % (2*nbatch_K2) - (DKQ - DV) : DV;
#if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
T_A_VKQ A_identity;
make_identity_mat(A_identity);
#endif // defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE)
// Calculate VKQ tile, need to use logical rather than physical elements for i0 due to transposition of V:
#pragma unroll
@@ -816,7 +727,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
}
const half2 * tile_V_i = i0_start < reusable_cutoff ? tile_V : tile_V + (i0_start - reusable_cutoff)/2;
#if defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#if defined(TURING_MMA_AVAILABLE)
constexpr int i0_stride = cols_per_warp == 8 ? T_C_VKQ::I : 2*T_C_VKQ::J;
#pragma unroll
for (int i_VKQ_0 = i0_start; i_VKQ_0 < i0_stop; i_VKQ_0 += i0_stride) {
@@ -826,26 +737,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
const int k0 = k00 + (threadIdx.y % np)*T_A_VKQ::J;
T_A_VKQ A; // Transposed in SRAM but not in registers, gets transposed on load.
#if defined(LDMATRIX_TRANS_AVAILABLE)
load_ldmatrix_trans(A, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
#else
// TODO: Try to transpose tile_V when loading gmem to smem.
// Use mma to transpose T_A_VKQ for RDNA.
T_A_VKQ A_trans;
load_ldmatrix(A_trans, tile_V_i + 2*k0*stride_tile_V + (i_VKQ_0 - i0_start)/2, stride_tile_V);
mma(A, A_trans, A_identity);
#endif // defined(TURING_MMA_AVAILABLE)
if constexpr (T_B_KQ::I == 8) {
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
} else {
// Wide version of VKQ_C is column-major.
#if defined(AMD_WMMA_AVAILABLE)
// RDNA matrix C is column-major.
mma(VKQ_C[i_VKQ_0/i0_stride], A, B[k00/(np*T_A_VKQ::J)]);
#else
// swap A and B for CUDA.
// Wide version of VKQ_C is column-major => swap A and B.
mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::J)], A);
#endif // defined(AMD_WMMA_AVAILABLE)
}
}
}
@@ -864,7 +761,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
mma(VKQ_C[i_VKQ_0/i0_stride], B[k00/(np*T_A_VKQ::I)], A);
}
}
#endif // defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
#endif // defined(TURING_MMA_AVAILABLE)
if constexpr (nstages <= 1) {
__syncthreads(); // Only needed if tile_K == tile_V.
@@ -877,7 +774,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
tile_Q, tile_K, tile_V, tile_mask,
Q_B, VKQ_C, KQ_max, KQ_rowsum, kb0);
NO_DEVICE_CODE;
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
}
#if defined(TURING_MMA_AVAILABLE)
@@ -897,15 +794,6 @@ template<> struct mma_tile_sizes<8> {
using T_B_VKQ = tile< 8, 8, half2>; // column-major
using T_C_VKQ = tile<16, 4, half2>; // row-major
};
#elif defined(AMD_WMMA_AVAILABLE)
template<int ncols> struct mma_tile_sizes {
using T_A_KQ = tile<16, 8, half2>; // row-major
using T_B_KQ = tile<16, 8, half2>; // column-major
using T_C_KQ = tile<16, 16, float>; // column-major
using T_A_VKQ = tile<16, 8, half2>; // row-major
using T_B_VKQ = tile<16, 8, half2>; // column-major
using T_C_VKQ = tile<16, 8, half2>; // column-major
};
#else // Volta
template<int ncols> struct mma_tile_sizes {
using T_A_KQ = tile< 8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED>; // row-major
@@ -940,7 +828,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
const int jt,
const int kb0_start,
const int kb0_stop) {
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#if defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr int ncols = ncols1 * ncols2;
@@ -952,7 +840,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
using T_C_VKQ = typename mma_tile_sizes<ncols>::T_C_VKQ;
constexpr int cols_per_warp = T_B_KQ::I;
constexpr int cols_per_thread = get_cols_per_thread();
constexpr int cols_per_thread = 2; // This is specifically KQ columns, Volta only has a single VKQ column.
constexpr int np = nwarps * (cols_per_warp/ncols2) / ncols1; // Number of parallel CUDA warps per Q column.
constexpr int nbatch_fa = ggml_cuda_fattn_mma_get_nbatch_fa (DKQ, DV, ncols);
constexpr int nbatch_K2 = ggml_cuda_fattn_mma_get_nbatch_K2 (DKQ, DV, ncols);
@@ -983,8 +871,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
T_B_KQ Q_B[(Q_in_reg ? DKQ/(2*T_B_KQ::J) : 1)];
#if defined(TURING_MMA_AVAILABLE)
T_C_VKQ VKQ_C[cols_per_warp == 8 ? DV/T_C_VKQ::I : DV/(2*T_C_VKQ::J)];
#elif defined(AMD_WMMA_AVAILABLE)
T_C_VKQ VKQ_C[ DV/(2*T_C_VKQ::J)];
#else // Volta
T_C_VKQ VKQ_C[ DV/(2*T_C_VKQ::J)];
#endif // defined(TURING_MMA_AVAILABLE)
@@ -1124,10 +1010,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
// The partial sums are spread across 8/4 threads.
constexpr int offset_first = cols_per_warp == 8 ? 16 : 2;
constexpr int offset_last = cols_per_warp == 8 ? 4 : 1;
#elif defined(AMD_WMMA_AVAILABLE)
// The partial sums are spread across 2 threads.
constexpr int offset_first = 16;
constexpr int offset_last = 16;
#else // Volta
// The partial sums are spread across 2 threads.
constexpr int offset_first = 2;
@@ -1165,7 +1047,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
#if defined(TURING_MMA_AVAILABLE)
if constexpr (cols_per_warp == 8) {
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[cols_per_thread - 1]);
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
#pragma unroll
for (int i = 0; i < DV/T_C_VKQ::I; ++i) {
#pragma unroll
@@ -1186,15 +1068,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
}
}
}
#elif defined(AMD_WMMA_AVAILABLE)
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[0]);
#pragma unroll
for (int i = 0; i < (DV/2)/T_C_VKQ::J; ++i) {
#pragma unroll
for (int l = 0; l < T_C_VKQ::ne; ++l) {
VKQ_C[i].x[l] *= KQ_max_scale_h2;
}
}
#else // Volta
const int col = (threadIdx.x / 2) % 2;
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]);
@@ -1246,10 +1119,6 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(threadIdx.x % 4);
const float2 KQ_cmr = make_float2(KQ_max[threadIdx.x % cols_per_thread], KQ_rowsum[threadIdx.x % cols_per_thread]);
const bool thread_should_write = threadIdx.x % 4 < cols_per_thread;
#elif defined(AMD_WMMA_AVAILABLE)
const int jc_cwm = threadIdx.y*cols_per_warp + T_C_VKQ::get_i(0);
const float2 KQ_cmr = make_float2(KQ_max[0], KQ_rowsum[0]);
const bool thread_should_write = threadIdx.x / 16 < cols_per_thread;
#else // Volta
const int jc_cwm = threadIdx.y*cols_per_warp + T_C_KQ::get_i(threadIdx.x & 2);
const float2 KQ_cmr = make_float2(KQ_max[(threadIdx.x & 2) / 2], KQ_rowsum[(threadIdx.x & 2) / 2]);
@@ -1450,7 +1319,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
stride_Q1, stride_Q2, stride_K, stride_V, stride_mask,
jt, kb0_start, kb0_stop);
NO_DEVICE_CODE;
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4))
#endif // defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE)
}
template<int DKQ, int DV, int ncols1, int ncols2, bool use_logit_softcap, bool mla>
@@ -1477,7 +1346,7 @@ static __global__ void flash_attn_ext_f16(
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(DKQ == 128 || DKQ == 256)) {
@@ -1491,13 +1360,6 @@ static __global__ void flash_attn_ext_f16(
}
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_TURING
#if defined(AMD_WMMA_AVAILABLE)
if (ncols1*ncols2 > 32 || ncols1*ncols2 < 16 || DKQ > 128 || ncols2 == 1) {
NO_DEVICE_CODE;
return;
}
#endif // defined(AMD_WMMA_AVAILABLE)
static_assert(!mla || DKQ >= DV, "MLA needs DKQ >= DV");
constexpr int ncols = ncols1 * ncols2;
@@ -1611,7 +1473,7 @@ static __global__ void flash_attn_ext_f16(
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || (defined(AMD_WMMA_AVAILABLE) && defined(RDNA4)))
#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE))
}
template <int DKQ, int DV, int ncols1, int ncols2>
@@ -1630,7 +1492,7 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
const bool Q_in_reg = ggml_cuda_fattn_mma_get_Q_in_reg (DKQ, DV, ncols, cc);
const int nstages = ggml_cuda_fattn_mma_get_nstages (DKQ, DV, ncols1, ncols2, cc);
const int cols_per_warp = std::min(ncols, get_cols_per_warp(cc));
const int cols_per_warp = std::min(ncols, turing_mma_available(cc) ? 16 : 32);
const int nwarps = nthreads / WARP_SIZE;
constexpr bool mla = DKQ == 576;
@@ -1650,34 +1512,29 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
#if defined(GGML_USE_HIP)
using fattn_kernel_ptr_t = const void*;
#else
using fattn_kernel_ptr_t = fattn_kernel_t;
#endif // defined(GGML_USE_HIP)
fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, use_logit_softcap, mla>;
#if !defined(GGML_USE_MUSA)
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(reinterpret_cast<fattn_kernel_ptr_t>(fattn_kernel), cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
shared_memory_limit_raised[id] = true;
}
#endif // !defined(GGML_USE_MUSA)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<DKQ, DV, ncols1, ncols2, use_logit_softcap, mla>;
#if !defined(GGML_USE_MUSA)
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(reinterpret_cast<fattn_kernel_ptr_t>(fattn_kernel), cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
CUDA_CHECK(cudaFuncSetAttribute(fattn_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared_total));
shared_memory_limit_raised[id] = true;
}
#endif // !defined(GGML_USE_MUSA)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}
launch_fattn<DV, ncols1, ncols2>

View File

@@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
return 128;
}
// Currenlty llvm with the amdgcn target does not support unrolling loops
// Currenlty llvm with the amdgcn target dose not support unrolling loops
// that contain a break that can not be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push

View File

@@ -18,12 +18,12 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
}
}
if ((turing_mma_available(cc) || amd_wmma_available(cc)) && Q->ne[1] <= 16/ncols2) {
if (turing_mma_available(cc) && Q->ne[1] <= 16/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
return;
}
if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || amd_wmma_available(cc) || Q->ne[1] <= 32/ncols2) {
if (ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
return;
}
@@ -230,18 +230,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
// The effective batch size for the kernel can be increased by gqa_ratio.
// The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded,
bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
for (const ggml_tensor * t : {Q, K, V, mask}) {
if (t == nullptr) {
continue;
}
for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
if (t->nb[i] % 16 != 0) {
gqa_opt_applies = false;
break;
}
}
}
const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
const int cc = ggml_cuda_info().devices[device].cc;
@@ -348,31 +337,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_WMMA_F16;
}
if (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc) && gqa_opt_applies && Q->ne[0] <= 128 && Q->ne[0] != 40 && Q->ne[0] != 72) {
if (can_use_vector_kernel) {
if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
if (Q->ne[1] == 1) {
if (!gqa_opt_applies) {
return BEST_FATTN_KERNEL_VEC;
}
}
} else {
if (Q->ne[1] <= 2) {
return BEST_FATTN_KERNEL_VEC;
}
}
}
int gqa_ratio_eff = 1;
const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
gqa_ratio_eff *= 2;
}
if (Q->ne[1] * gqa_ratio_eff <= 8) {
return BEST_FATTN_KERNEL_TILE; // AMD WMMA is only faster if the full tile width of 16 can be utilized.
}
return BEST_FATTN_KERNEL_MMA_F16;
}
// If there are no tensor cores available, use the generic tile kernel:
if (can_use_vector_kernel) {
if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {

View File

@@ -2853,9 +2853,9 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
}
#ifdef USE_CUDA_GRAPH
static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
static bool check_node_graph_compatibility(ggml_cgraph * cgraph,
bool use_cuda_graph) {
bool use_cuda_graph = true;
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
@@ -2915,41 +2915,41 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
return use_cuda_graph;
}
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
props->node_address = node->data;
props->node_op = node->op;
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
graph_node_properties->node_address = node->data;
graph_node_properties->node_op = node->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
props->ne[i] = node->ne[i];
props->nb[i] = node->nb[i];
graph_node_properties->ne[i] = node->ne[i];
graph_node_properties->nb[i] = node->nb[i];
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
}
memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS);
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) {
if (node->data != props->node_address &&
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address &&
node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != props->node_op) {
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != props->ne[i]) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != props->nb[i]) {
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i] &&
node->src[i]->data != props->src_address[i] &&
node->src[i]->data != graph_node_properties->src_address[i] &&
node->op != GGML_OP_VIEW
) {
return false;
@@ -2957,55 +2957,44 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
}
if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
memcmp(props->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
return true;
}
static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
bool res = false;
bool cuda_graph_update_required = false;
if (cuda_ctx->cuda_graph->instance == nullptr) {
res = true;
cuda_graph_update_required = true;
}
// Check if the graph size has changed
if (cuda_ctx->cuda_graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) {
res = true;
cuda_ctx->cuda_graph->props.resize(cgraph->n_nodes + cgraph->n_leafs);
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cuda_graph_update_required = true;
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
for (int i = 0; i < cgraph->n_nodes; i++) {
bool props_match = true;
if (!res) {
props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &cuda_ctx->cuda_graph->props[i]);
bool has_matching_properties = true;
if (!cuda_graph_update_required) {
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
if (!props_match) {
res = true;
if (!has_matching_properties) {
cuda_graph_update_required = true;
}
ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[i], cgraph->nodes[i]);
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
for (int i = 0; i < cgraph->n_leafs; i++) {
bool props_match= true;
if (!res) {
props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &cuda_ctx->cuda_graph->props[cgraph->n_nodes + i]);
}
if (!props_match) {
res = true;
}
ggml_cuda_graph_node_set_properties(&cuda_ctx->cuda_graph->props[cgraph->n_nodes + i], cgraph->leafs[i]);
}
return res;
return cuda_graph_update_required;
}
static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_ctx) {
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
#if CUDART_VERSION >= 12000
cudaGraphExecUpdateResultInfo result_info;
@@ -3236,11 +3225,10 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
return false;
}
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required) {
bool graph_evaluated_or_captured = false;
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
bool is_concurrent_event_active = false;
@@ -3710,7 +3698,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
}
if (cuda_graph_update_required) { // Update graph executable
ggml_cuda_graph_update_executable(cuda_ctx);
update_cuda_graph_executable(cuda_ctx);
}
// Launch graph
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
@@ -3720,26 +3708,43 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
}
}
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx) {
static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
// Objects required for CUDA Graph
if (cuda_ctx->cuda_graph == nullptr) {
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
}
bool use_cuda_graph = true;
if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
return cuda_ctx->cuda_graph->is_enabled();
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
// or previous graph capture failure.
// Also disable for multi-gpu for now. TO DO investigate
if (disable_cuda_graphs_due_to_env
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
use_cuda_graph = false;
}
cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
#else
GGML_UNUSED(cuda_ctx);
return false;
bool use_cuda_graph = false;
#endif // USE_CUDA_GRAPH
return use_cuda_graph;
}
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
@@ -3750,14 +3755,30 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
// graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
// we call it here instead.
#ifdef USE_CUDA_GRAPH
use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
if (cuda_ctx->cuda_graph->is_enabled()) {
cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
cuda_ctx->cuda_graph->record_update(use_cuda_graph, cuda_graph_update_required);
use_cuda_graph = check_node_graph_compatibility(cgraph, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++;
} else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
}
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
#endif // USE_CUDA_GRAPH
@@ -3771,7 +3792,9 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
ggml_cuda_graph_evaluate_and_capture(cuda_ctx, cgraph, use_cuda_graph, cuda_graph_update_required);
bool graph_evaluated_or_captured = false;
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
return GGML_STATUS_SUCCESS;
}
@@ -3804,7 +3827,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
const bool use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx);
const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
static bool enable_graph_optimization = [] {
const char * env = getenv("GGML_CUDA_GRAPH_OPT");
@@ -4123,7 +4146,6 @@ struct ggml_backend_cuda_device_context {
std::string name;
std::string description;
std::string pci_bus_id;
int op_offload_min_batch_size;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -4551,7 +4573,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_L2_NORM:
return true;
case GGML_OP_RMS_NORM_BACK:
return ggml_is_contiguous(op->src[0]);
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
@@ -4678,9 +4700,11 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
}
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
const int min_batch_size = 32;
return get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size;
return get_op_batch_size(op) >= min_batch_size;
GGML_UNUSED(dev);
}
static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_t dev) {
@@ -4848,7 +4872,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
for (int i = 0; i < ggml_cuda_info().device_count; i++) {
ggml_backend_cuda_device_context * dev_ctx = new ggml_backend_cuda_device_context;
@@ -4862,7 +4885,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;
dev_ctx->op_offload_min_batch_size = min_batch_size;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,

View File

@@ -34,11 +34,13 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
// CUDA_GRAPHS_DISABLED
((ncols > 65536) &&
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
ctx.cuda_graph->is_enabled())) ||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
// CUDA_GRAPHS ENABLED
((ncols > 32768) &&
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
ctx.cuda_graph->is_enabled()))) {
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
#else
(ncols > 65536)) {
#endif // USE_CUDA_GRAPH

View File

@@ -206,16 +206,10 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 16 && J == 16) {
// matrix C
#if defined(RDNA3)
if constexpr (std::is_same_v<T, float> || std::is_same_v<T, int>) {
// matrix C
return 2 * l + (threadIdx.x / 16);
} else {
// matrix A&B
return l;
}
return 2 * l + (threadIdx.x / 16);
#else
// matrix C is the transposed matrix A&B on RDNA4
return ne * (threadIdx.x / 16) + l;
#endif // defined(RDNA3)
} else if constexpr (I == 16 && J == 8) {
@@ -627,21 +621,6 @@ namespace ggml_cuda_mma {
return ret;
}
#elif defined(AMD_WMMA_AVAILABLE)
template <int I, int J>
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
tile<I, J/2, half2> ret;
#pragma unroll
for (int l0 = 0; l0 < tile_float.ne; l0 += 2) {
ret.x[l0/2] = make_half2(tile_float.x[l0 + 0], tile_float.x[l0 + 1]);
}
return ret;
}
static __device__ __forceinline__ tile<8, 8, half2> get_transposed(const tile<16, 4, half2> & t) {
NO_DEVICE_CODE;
return tile<8, 8, half2>{};
}
#else // Volta
template <int I, int J>
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
@@ -660,19 +639,6 @@ namespace ggml_cuda_mma {
}
#endif // defined(TURING_MMA_AVAILABLE)
static __device__ __forceinline__ void make_identity_mat(tile<16, 8, half2> & t) {
#if defined(RDNA4)
const int row = t.get_i(0);
const int left_right = t.get_j(0) / 4;
const int up_down = row / 8;
const int idx = row % 8;
reinterpret_cast<half*>(t.x)[idx] = left_right == up_down ? 1.0f : 0.0f;
#else
GGML_UNUSED_VARS(t);
NO_DEVICE_CODE;
#endif // defined(RDNA4)
}
template <int I, int J, typename T, data_layout dl>
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#if defined(AMD_MFMA_AVAILABLE)
@@ -912,17 +878,6 @@ namespace ggml_cuda_mma {
: "+r"(Dxi[2]), "+r"(Dxi[3])
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
halfx8_t& acc_frag = reinterpret_cast<halfx8_t&>(D.x[0]);
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // defined(RDNA4)
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;

View File

@@ -190,7 +190,7 @@ void ggml_cuda_mul_mat_q(
{
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
const int64_t s13 = src1->nb[2] / ts_src1;
if (use_native_mxfp4) {
quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -333,31 +333,6 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
}
if (amd_wmma_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
// High expert counts are almost always better on MMQ due to
// the synchronization overhead in the cuBLAS/hipBLAS path:
// https://github.com/ggml-org/llama.cpp/pull/18202
if (n_experts >= 64) {
return true;
}
// For some quantization types MMQ can have lower peak TOPS than hipBLAS
// so it's only faster for sufficiently small batch sizes:
switch (type) {
case GGML_TYPE_Q2_K:
return ne11 <= 128;
case GGML_TYPE_Q6_K:
return ne11 <= (GGML_CUDA_CC_IS_RDNA3_0(cc) ? 128 : 256);
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
return GGML_CUDA_CC_IS_RDNA3_5(cc) || ne11 <= 128;
default:
return true;
}
}
// For RDNA4 MMQ is consistently faster than dequantization + hipBLAS:
// https://github.com/ggml-org/llama.cpp/pull/18537#issuecomment-3706422301
return true;
}

View File

@@ -25,8 +25,19 @@ static __global__ void norm_f32(
}
// sum up partial sums
extern __shared__ float2 s_sum2[];
mean_var = block_reduce<block_reduce_method::SUM, block_size>(mean_var, s_sum2);
mean_var = warp_reduce_sum(mean_var);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float2 s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
}
__syncthreads();
mean_var = s_sum[lane_id];
mean_var = warp_reduce_sum(mean_var);
}
const float mean = mean_var.x / ncols;
const float var = mean_var.y / ncols - mean * mean;
@@ -50,8 +61,19 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp += x[j];
}
extern __shared__ float s_sum[];
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
}
const float mean = tmp / group_size;
tmp = 0.0f;
@@ -62,7 +84,18 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp += xi * xi;
}
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
tmp = warp_reduce_sum(tmp);
if (block_size > WARP_SIZE) {
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
}
const float variance = tmp / group_size;
const float scale = rsqrtf(variance + eps);
@@ -130,8 +163,22 @@ static __global__ void rms_norm_f32(const float * x,
}
// sum up partial sums
extern __shared__ float s_sum[];
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = tid / WARP_SIZE;
const int lane_id = tid % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = 0.0f;
if (lane_id < (block_size / WARP_SIZE)) {
tmp = s_sum[lane_id];
}
tmp = warp_reduce_sum(tmp);
}
const float mean = tmp / ncols;
const float scale = rsqrtf(mean + eps);
@@ -259,8 +306,19 @@ static __global__ void l2_norm_f32(
}
// sum up partial sums
extern __shared__ float s_sum[];
tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = warp_reduce_sum(tmp);
}
// from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html
const float scale = rsqrtf(fmaxf(tmp, eps * eps));
@@ -279,7 +337,7 @@ static void norm_f32_cuda(
norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float2): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}
@@ -290,7 +348,7 @@ static void group_norm_f32_cuda(
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
} else {
const dim3 block_dims(1024, 1, 1);
group_norm_f32<1024><<<num_groups, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, group_size, ne_elements, eps);
group_norm_f32<1024><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
}
}
@@ -300,10 +358,10 @@ static void rms_norm_f32_cuda(
const dim3 blocks_num(nrows, nchannels, nsamples);
if (ncols < 1024) {
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, false><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}
@@ -346,12 +404,12 @@ static void rms_norm_mul_f32_cuda(const float * x,
const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
if (ncols < 1024) {
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
}
@@ -367,14 +425,14 @@ static void rms_norm_mul_f32_cuda(const float * x,
const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples);
if (ncols < 1024) {
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
add_nchannels_packed, add_nsamples_packed);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
@@ -402,7 +460,7 @@ static void l2_norm_f32_cuda(
l2_norm_f32<WARP_SIZE><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
l2_norm_f32<1024><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}

View File

@@ -28,8 +28,22 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
}
// sum up partial sums
__shared__ float shared_vals[32];
sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);
sum = warp_reduce_sum(sum);
if (blockDim.x > WARP_SIZE) {
assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = sum;
}
__syncthreads();
sum = 0.0f;
if (lane_id < (static_cast<int>(blockDim.x) / WARP_SIZE)) {
sum = s_sum[lane_id];
}
sum = warp_reduce_sum(sum);
}
if (col != 0) {
return;

Some files were not shown because too many files have changed in this diff Show More