Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits: b6188 ... sync-ggml- (2 commits)

| Author | SHA1 | Date |
|---|---|---|
| | a5801f408f | |
| | 2c1f810178 | |
@@ -1,130 +0,0 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# Define the CANN base image for easier version updates later
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# ... You can add other environment variables from the original file as needed ...
# For brevity, only core variables are listed here. You can paste the original ENV list here.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Create a lib directory to store all .so files
RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

# Create a full directory to store all executables and Python scripts
RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/
# If you have a tools.sh script, make sure it is copied here
# cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
    yum clean all && \
    rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# ... You can add other environment variables from the original file as needed ...

WORKDIR /app

# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

COPY --from=build /app/full /app

# Install Python dependencies
RUN yum install -y git python3 python3-pip && \
    pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
    pip3 install --no-cache-dir -r requirements.txt && \
    yum clean all && \
    rm -rf /var/cache/yum

# You need to provide a tools.sh script as the entrypoint
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light

COPY --from=build /app/full/llama-cli /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
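The stages above map directly to docker build targets. A minimal usage sketch, assuming the file is saved as .devops/cann.Dockerfile (the path, SoC override, and image tags are illustrative, not part of the diff):

# build the CLI-only "light" image for the default Ascend910B3 SoC
docker build -f .devops/cann.Dockerfile --target light -t llama-cpp-cann:light .

# build the dedicated server image, overriding the SoC type
docker build -f .devops/cann.Dockerfile --target server \
    --build-arg ASCEND_SOC_TYPE=Ascend310P3 -t llama-cpp-cann:server .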
22 .devops/cloud-v-pipeline (Normal file)
@@ -0,0 +1,22 @@
node('x86_runner1'){                      // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
    stage('Cleanup'){
        cleanWs()                         // Cleaning previous CI build in workspace
    }
    stage('checkout repo'){
        retry(5){                         // Retry if the cloning fails due to some reason
            checkout scm                  // Clone the repo on Runner
        }
    }
    stage('Compiling llama.cpp'){
        sh'''#!/bin/bash
        make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
        '''
    }
    stage('Running llama.cpp'){
        sh'''#!/bin/bash
        module load gnu-bin2/0.1           # loading latest versions of vector qemu and vector gcc
        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
        cat llama_log.txt                  # Printing results
        '''
    }
}
@@ -4,6 +4,8 @@ FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

@@ -11,8 +13,10 @@ WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
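TARGETARCH is populated automatically by BuildKit from the requested platform. A hedged sketch of exercising both branches of the RUN above (platform list, tag, and ARM arch value are illustrative assumptions):

docker buildx build --platform linux/amd64,linux/arm64 \
    --build-arg GGML_CPU_ARM_ARCH=armv8.2-a -t llama-cpp:cpu .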
@@ -60,7 +60,8 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
    && pip install --break-system-packages -r requirements.txt \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -40,7 +40,7 @@ body:
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
      multiple: true
    validations:
      required: true
2 .github/ISSUE_TEMPLATE/011-bug-results.yml (vendored)
@@ -42,7 +42,7 @@ body:
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
      multiple: true
    validations:
      required: true
5 .github/labeler.yml (vendored)
@@ -22,11 +22,6 @@ Vulkan:
    - any-glob-to-any-file:
      - ggml/include/ggml-vulkan.h
      - ggml/src/ggml-vulkan/**
IBM zDNN:
  - changed-files:
    - any-glob-to-any-file:
      - ggml/include/ggml-zdnn.h
      - ggml/src/ggml-zdnn/**
documentation:
  - changed-files:
    - any-glob-to-any-file:
43 .github/workflows/build-riscv-native.yml (vendored)
@@ -1,43 +0,0 @@
name: Build on RISCV Linux Machine by Cloud-V
on:
  workflow_dispatch:
  workflow_call:

jobs:
  bianbu-riscv64-native: # Bianbu 2.2
    runs-on: self-hosted

    steps:
      - name: Install prerequisites
        run: |
          sudo apt-get update || true
          sudo apt-get install -y libatomic1
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo apt-get update || true
          sudo apt-get install -y --no-install-recommends \
            build-essential \
            gcc-14-riscv64-linux-gnu \
            g++-14-riscv64-linux-gnu \
            cmake

      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
            -DLLAMA_BUILD_TOOLS=ON \
            -DLLAMA_BUILD_TESTS=OFF \
            -DCMAKE_SYSTEM_NAME=Linux \
            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
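The job above cross-compiles for riscv64 with gcc-14; the resulting binaries would then run under qemu user-mode emulation, much as the Cloud-V pipeline earlier in this diff does. A hedged sketch (the sysroot path mirrors CMAKE_FIND_ROOT_PATH above and is an assumption):

qemu-riscv64 -L /usr/lib/riscv64-linux-gnu ./build/bin/llama-cli --version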
128 .github/workflows/build.yml (vendored)
@@ -64,7 +64,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
@@ -104,7 +104,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
@@ -144,7 +144,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64-webgpu
          evict-old-files: 1d
@@ -159,15 +159,31 @@ jobs:
      - name: Dawn Dependency
        id: dawn-depends
        run: |
          DAWN_VERSION="v1.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          curl -L -o artifact.tar.gz \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          ARTIFACTS_JSON=$(curl -s -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/google/dawn/actions/artifacts")
          echo "Finding latest macos-latest-Release artifact..."
          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
            | sort_by(.created_at)
            | reverse
            | map(select(.name | test("macos-latest-Release$")))
            | .[0].archive_download_url')
          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
            echo "No suitable Dawn artifact found!"
            exit 1
          fi
          echo "Downloading from: $DOWNLOAD_URL"
          curl -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -o artifact.zip "$DOWNLOAD_URL"
          unzip artifact.zip
          mkdir dawn
          tar -xvf artifact.tar.gz -C dawn --strip-components=1
          tar_file=$(find . -name '*.tar.gz' | head -n 1)
          echo "Extracting: $tar_file"
          tar -xvf "$tar_file" -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@@ -199,7 +215,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
@@ -251,7 +267,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
          evict-old-files: 1d
@@ -330,7 +346,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-rpc
          evict-old-files: 1d
@@ -363,7 +379,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
@@ -400,7 +416,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-webgpu
          evict-old-files: 1d
@@ -417,15 +433,31 @@ jobs:
        id: dawn-depends
        run: |
          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
          DAWN_VERSION="v1.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          curl -L -o artifact.tar.gz \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
          ARTIFACTS_JSON=$(curl -s -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/google/dawn/actions/artifacts")
          echo "Finding latest ubuntu-latest-Release artifact..."
          DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
            | sort_by(.created_at)
            | reverse
            | map(select(.name | test("ubuntu-latest-Release$")))
            | .[0].archive_download_url')
          if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
            echo "No suitable Dawn artifact found!"
            exit 1
          fi
          echo "Downloading from: $DOWNLOAD_URL"
          curl -L \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            -o artifact.zip "$DOWNLOAD_URL"
          unzip artifact.zip
          mkdir dawn
          tar -xvf artifact.tar.gz -C dawn --strip-components=1
          tar_file=$(find . -name '*.tar.gz' | head -n 1)
          echo "Extracting: $tar_file"
          tar -xvf "$tar_file" -C dawn --strip-components=1

      - name: Build
        id: cmake_build
@@ -443,7 +475,7 @@ jobs:

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.1.2
    container: rocm/dev-ubuntu-22.04:6.0.2

    steps:
      - name: Clone
@@ -457,7 +489,7 @@ jobs:
          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-hip
          evict-old-files: 1d
@@ -471,6 +503,16 @@ jobs:
            -DGGML_HIP=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
          cmake -B build2 -S . \
            -DCMAKE_C_COMPILER=hipcc \
            -DCMAKE_CXX_COMPILER=hipcc \
            -DGGML_HIP_ROCWMMA_FATTN=ON \
            -DGGML_HIP=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
@@ -487,7 +529,7 @@ jobs:
          apt-get install -y build-essential git cmake libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-musa
          evict-old-files: 1d
@@ -532,7 +574,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-sycl
          evict-old-files: 1d
@@ -580,7 +622,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-sycl-fp16
          evict-old-files: 1d
@@ -611,7 +653,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-ios
          evict-old-files: 1d
@@ -648,7 +690,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-tvos
          evict-old-files: 1d
@@ -720,7 +762,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-swift
          evict-old-files: 1d
@@ -766,7 +808,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-msys2
          variant: ccache
@@ -834,7 +876,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.build }}
          variant: ccache
@@ -948,7 +990,7 @@ jobs:
          apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-cmake-cuda
          evict-old-files: 1d
@@ -977,7 +1019,7 @@ jobs:
        uses: actions/checkout@v4

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -1033,7 +1075,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
@@ -1070,8 +1112,7 @@ jobs:
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $proc.WaitForExit(600000)
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
@@ -1080,7 +1121,7 @@ jobs:
          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ${{ github.job }}
          evict-old-files: 1d
@@ -1114,11 +1155,6 @@ jobs:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Xcode
        uses: maxim-lobanov/setup-xcode@v1
        with:
          xcode-version: latest-stable

      - name: Build
        id: cmake_build
        run: |
@@ -1152,7 +1188,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: android-build
          evict-old-files: 1d
53 .github/workflows/copilot-setup-steps.yml (vendored)
@@ -1,53 +0,0 @@
name: "Copilot Setup Steps"

# Automatically run the setup steps when they are changed to allow for easy validation, and
# allow manual testing through the repository's "Actions" tab
on:
  workflow_dispatch:
  push:
    paths:
      - .github/workflows/copilot-setup-steps.yml
  pull_request:
    paths:
      - .github/workflows/copilot-setup-steps.yml

jobs:
  # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
  copilot-setup-steps:
    runs-on: ubuntu-latest

    # Set the permissions to the lowest permissions possible needed for your steps.
    # Copilot will be given its own token for its operations.
    permissions:
      # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
      contents: read

    # You can define any steps you want, and they will run before the agent starts.
    # If you do not check out your code, Copilot will do this for you.
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: copilot-setup-steps
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/activate
          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
          pip install flake8 pyright
45 .github/workflows/pre-tokenizer-hashes.yml (vendored)
@@ -1,45 +0,0 @@
name: Check Pre-Tokenizer Hashes

on:
  push:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'
  pull_request:
    paths:
      - 'convert_hf_to_gguf.py'
      - 'convert_hf_to_gguf_update.py'

jobs:
  pre-tokenizer-hashes:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

      - name: Update pre-tokenizer hashes
        run: |
          cp convert_hf_to_gguf.py /tmp
          .venv/bin/python convert_hf_to_gguf_update.py --check-missing

      - name: Check if committed pre-tokenizer hashes matches generated version
        run: |
          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
            echo "Differences found:"
            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
            exit 1
          fi
          echo "Model pre-tokenizer hashes are up to date."
27 .github/workflows/release.yml (vendored)
@@ -32,7 +32,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-arm64
          evict-old-files: 1d
@@ -85,7 +85,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: macOS-latest-cmake-x64
          evict-old-files: 1d
@@ -147,7 +147,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-cmake
          evict-old-files: 1d
@@ -198,7 +198,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d
@@ -256,7 +256,7 @@ jobs:
          fetch-depth: 0

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-cpu-${{ matrix.arch }}
          variant: ccache
@@ -328,7 +328,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
@@ -398,7 +398,7 @@ jobs:
        uses: actions/checkout@v4

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -471,7 +471,7 @@ jobs:
        uses: actions/checkout@v4

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-sycl
          variant: ccache
@@ -545,7 +545,7 @@ jobs:
          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        uses: hendrikmuhs/ccache-action@v1.2.16
        with:
          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d
@@ -557,8 +557,7 @@ jobs:
          write-host "Downloading AMD HIP SDK Installer"
          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $proc.WaitForExit(600000)
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
@@ -601,7 +600,7 @@ jobs:
          name: llama-bin-win-hip-${{ matrix.name }}-x64.zip

  ios-xcode-build:
    runs-on: macos-15
    runs-on: macos-latest

    steps:
      - name: Checkout code
@@ -609,10 +608,6 @@ jobs:
        with:
          fetch-depth: 0

      - name: Setup Xcode
        run: |
          sudo xcode-select -s /Applications/Xcode_16.4.app

      - name: Build
        id: cmake_build
        run: |
1 .gitignore (vendored)
@@ -82,7 +82,6 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
!models/templates

# Zig
zig-out/
@@ -12,8 +12,6 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")

# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -10,4 +10,3 @@
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo
@@ -17,8 +17,6 @@ LLM inference in C/C++

## Hot topics

- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
@@ -241,7 +239,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Infrastructure</summary>

- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
264 common/arg.cpp
@@ -24,7 +24,6 @@
#include <cstdarg>
#include <filesystem>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
@@ -749,39 +748,6 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
// utils
//

// Helper function to parse tensor buffer override strings
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
        auto * buft = ggml_backend_dev_buffer_type(dev);
        if (buft) {
            buft_list[ggml_backend_buft_name(buft)] = buft;
        }
    }

    for (const auto & override : string_split<std::string>(value, ',')) {
        std::string::size_type pos = override.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        std::string tensor_name = override.substr(0, pos);
        std::string buffer_type = override.substr(pos + 1);

        if (buft_list.find(buffer_type) == buft_list.end()) {
            printf("Available buffer types:\n");
            for (const auto & it : buft_list) {
                printf("  %s\n", ggml_backend_buft_name(it.second));
            }
            throw std::invalid_argument("unknown buffer type");
        }
        // keep strings alive and avoid leaking memory by storing them in a static vector
        static std::list<std::string> buft_overrides;
        buft_overrides.push_back(tensor_name);
        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
    }
}

struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
@@ -1011,10 +977,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
@@ -1026,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.speculative.tensor_buft_overrides.empty()) {
        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
@@ -1238,7 +1196,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
            common_params_print_completion(ctx_arg);
            exit(0);
        }
        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
@@ -1507,14 +1464,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--swa-checkpoints"}, "N",
        string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
        [](common_params & params, int value) {
            params.n_swa_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--kv-unified", "-kvu"},
        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -2142,13 +2091,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.no_kv_offload = true;
        }
    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
    add_opt(common_arg(
        {"-nr", "--no-repack"},
        "disable weight repacking",
        [](common_params & params) {
            params.no_extra_bufts = true;
        }
    ).set_env("LLAMA_ARG_NO_REPACK"));
    add_opt(common_arg(
        {"-ctk", "--cache-type-k"}, "TYPE",
        string_format(
@@ -2395,58 +2337,38 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    add_opt(common_arg(
        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type", [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
            if (buft_list.empty()) {
                // enumerate all the devices and add their buffer types to the list
                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                    auto * dev = ggml_backend_dev_get(i);
                    auto * buft = ggml_backend_dev_buffer_type(dev);
                    if (buft) {
                        buft_list[ggml_backend_buft_name(buft)] = buft;
                    }
                }
            }

            for (const auto & override : string_split<std::string>(value, ',')) {
                std::string::size_type pos = override.find('=');
                if (pos == std::string::npos) {
                    throw std::invalid_argument("invalid value");
                }
                std::string tensor_name = override.substr(0, pos);
                std::string buffer_type = override.substr(pos + 1);

                if (buft_list.find(buffer_type) == buft_list.end()) {
                    printf("Available buffer types:\n");
                    for (const auto & it : buft_list) {
                        printf("  %s\n", ggml_backend_buft_name(it.second));
                    }
                    throw std::invalid_argument("unknown buffer type");
                }
                // FIXME: this leaks memory
                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
            }
        }
    ));
    add_opt(common_arg(
        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--cpu-moe", "-cmoe"},
        "keep all Mixture of Experts (MoE) weights in the CPU",
        [](common_params & params) {
            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
        }
    ).set_env("LLAMA_ARG_CPU_MOE"));
    add_opt(common_arg(
        {"--n-cpu-moe", "-ncmoe"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
        [](common_params & params, int value) {
            if (value < 0) {
                throw std::invalid_argument("invalid value");
            }
            for (int i = 0; i < value; ++i) {
                // keep strings alive and avoid leaking memory by storing them in a static vector
                static std::list<std::string> buft_overrides;
                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
    ).set_env("LLAMA_ARG_N_CPU_MOE"));
    add_opt(common_arg(
        {"--cpu-moe-draft", "-cmoed"},
        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
        [](common_params & params) {
            params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
        [](common_params & params, int value) {
            if (value < 0) {
                throw std::invalid_argument("invalid value");
            }
            for (int i = 0; i < value; ++i) {
                static std::list<std::string> buft_overrides_draft;
                buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        "number of layers to store in VRAM",
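For reference, these overrides compose on the command line. A hedged usage sketch (the model path and the "CPU" buffer type name are assumptions, not part of the diff):

# keep every MoE expert tensor on the CPU while offloading the rest
llama-server -m model.gguf -ngl 99 --cpu-moe

# the explicit-pattern equivalent, using the same regex the -ncmoe option generates per layer
llama-server -m model.gguf -ngl 99 -ot "blk\.0\.ffn_(up|down|gate)_exps=CPU"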
@@ -2697,7 +2619,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2705,15 +2627,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_out_freq = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--output-format"}, "{gguf,dat}",
        string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
        [](common_params & params, const std::string & value) {
            /**/ if (value == "gguf") { params.imat_dat = -1; }
            else if (value == "dat")  { params.imat_dat =  1; }
            else { throw std::invalid_argument("invalid output format"); }
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--save-frequency"}, "N",
        string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -2989,9 +2902,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
        "- none: leaves thoughts unparsed in `message.content`\n"
        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
        "(default: auto)",
        "(default: deepseek)",
        [](common_params & params, const std::string & value) {
            params.reasoning_format = common_reasoning_format_from_name(value);
            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
            else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
@@ -3172,7 +3088,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-tbd", "--threads-batch-draft"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
@@ -3182,7 +3098,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
    add_opt(common_arg(
        {"-Cd", "--cpu-mask-draft"}, "M",
        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
@@ -3333,13 +3249,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
    add_opt(common_arg(
        {"--spec-replace"}, "TARGET", "DRAFT",
        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
        [](common_params & params, const std::string & tgt, const std::string & dft) {
            params.speculative.replacements.push_back({ tgt, dft });
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
        string_format(
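A hedged usage sketch for the imatrix options above (model and calibration files are assumptions; the output-file flag mirrors the out_file parameter shown earlier):

llama-imatrix -m model.gguf -f calibration.txt -o imatrix.gguf \
    --output-format gguf -ofreq 10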
@@ -3529,11 +3438,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));

    // diffusion parameters
    add_opt(common_arg(
        { "--diffusion-steps" }, "N",
        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
        [](common_params & params, int value) { params.diffusion.steps = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-eps" }, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-algorithm" }, "N",
        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
                      params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-alg-temp" }, "F",
        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-visual" },
        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
@@ -3541,85 +3467,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) { params.diffusion.visual_mode = true; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

    add_opt(common_arg(
        { "--diffusion-eps" }, "F",
        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-algorithm" }, "N",
        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                      params.diffusion.algorithm),
        [](common_params & params, int value) { params.diffusion.algorithm = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-alg-temp" }, "F",
        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

    add_opt(common_arg(
        { "--diffusion-block-length" }, "N",
        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
        [](common_params & params, int value) { params.diffusion.block_length = value; }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-cfg-scale" }, "F",
        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
    add_opt(common_arg(
        { "--diffusion-add-gumbel-noise" }, "F",
        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

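A hedged sketch of how the diffusion options above would be invoked (the llama-diffusion-cli binary name, model file, and chosen values are assumptions):

llama-diffusion-cli -m dream-7b.gguf -p "Write a haiku" \
    --diffusion-steps 256 --diffusion-algorithm 3 --diffusion-visual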
    add_opt(
        common_arg({ "-lr", "--learning-rate" }, "ALPHA",
            string_format(
                "adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
                (double) params.lr.lr0),
            [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(
        common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
            string_format(
                "(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
                (double) params.lr.lr_min),
            [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(
        common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
            string_format(
                "(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
                (double) params.lr.decay_epochs),
            [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
            .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        { "-wd", "--weight-decay" }, "WD",
        string_format(
            "adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
            (double) params.lr.wd),
        [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
        string_format("fraction of data to use as validation set for training (default: %.2g).",
            (double) params.val_split),
        [](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg({ "-epochs", "--epochs" }, "N",
        string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
        [](common_params & params, int epochs) { params.lr.epochs = epochs; })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
        [](common_params & params, const std::string & name) {
            params.optimizer = common_opt_get_optimizer(name.c_str());
            if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
                throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
            }
        })
        .set_examples({ LLAMA_EXAMPLE_FINETUNE }));

    return ctx_arg;
}
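A hedged sketch of how the finetune options above combine (the llama-finetune binary name and file paths are assumptions):

llama-finetune -m model.gguf -f train.txt -opt sgd -lr 1e-4 \
    -wd 1e-9 -epochs 2 -val-split 0.05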
@@ -55,15 +55,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
    std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
    std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
    std::string arguments = "";
    if (tool_call.contains("arguments")) {
        if (tool_call.at("arguments").is_object()) {
            arguments = tool_call.at("arguments").dump();
        } else {
            arguments = tool_call.at("arguments");
        }
    }

    std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
    return add_tool_call(name, id, arguments);
}
363 common/chat.cpp
@@ -126,8 +126,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm

typedef minja::chat_template common_chat_template;

struct common_chat_templates {
    bool add_bos;
    bool add_eos;
    bool has_explicit_template; // Model had builtin template or template override was specified.
    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
    std::unique_ptr<common_chat_template> template_tool_use;

@@ -145,8 +143,6 @@ struct templates_params {
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    json extra_context;
    bool add_bos;
    bool add_eos;
};

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {

@@ -296,7 +292,6 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    }
    if (!msg.reasoning_content.empty()) {
        jmsg["reasoning_content"] = msg.reasoning_content;
        jmsg["thinking"] = msg.reasoning_content; // gpt-oss
    }
    if (!msg.tool_name.empty()) {
        jmsg["name"] = msg.tool_name;

@@ -450,8 +445,6 @@ std::string common_chat_format_single(

    common_chat_templates_inputs inputs;
    inputs.use_jinja = use_jinja;
    inputs.add_bos = tmpls->add_bos;
    inputs.add_eos = tmpls->add_eos;

    std::string fmt_past_msg;
    if (!past_msg.empty()) {

@@ -473,12 +466,9 @@ std::string common_chat_format_single(
    return ss.str();
}

std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
    common_chat_templates_inputs inputs;
    inputs.use_jinja = use_jinja;
    inputs.add_bos = tmpls->add_bos;
    inputs.add_eos = tmpls->add_eos;
    inputs.chat_template_kwargs = chat_template_kwargs;
    auto add_simple_msg = [&](auto role, auto content) {
        common_chat_msg msg;
        msg.role = role;

@@ -554,21 +544,8 @@ common_chat_templates_ptr common_chat_templates_init(
            default_template_src = CHATML_TEMPLATE_SRC;
        }
    }

    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
    if (default_template_src.find("<|channel|>") != std::string::npos
        // search for the error message and patch it
        && default_template_src.find("in message.content or") != std::string::npos) {
        string_replace_all(default_template_src,
            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
            "{%- if false %}");
    }

    std::string token_bos = bos_token_override;
    std::string token_eos = eos_token_override;
    bool add_bos = false;
    bool add_eos = false;
    if (model) {
        const auto * vocab = llama_model_get_vocab(model);
        const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {

@@ -583,13 +560,9 @@ common_chat_templates_ptr common_chat_templates_init(
        };
        token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
        token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
        add_bos = llama_vocab_get_add_bos(vocab);
        add_eos = llama_vocab_get_add_eos(vocab);
    }
    common_chat_templates_ptr tmpls(new common_chat_templates());
    tmpls->has_explicit_template = has_explicit_template;
    tmpls->add_bos = add_bos;
    tmpls->add_eos = add_eos;
    try {
        tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
    } catch (const std::exception & e) {
@@ -619,8 +592,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
        case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
        case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
        default:
            throw std::runtime_error("Unknown chat format");
    }

@@ -629,28 +600,13 @@ const char * common_chat_format_name(common_chat_format format) {
const char * common_reasoning_format_name(common_reasoning_format format) {
    switch (format) {
        case COMMON_REASONING_FORMAT_NONE: return "none";
        case COMMON_REASONING_FORMAT_AUTO: return "auto";
        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
        default:
            throw std::runtime_error("Unknown reasoning format");
    }
}

common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
    if (format == "none") {
        return COMMON_REASONING_FORMAT_NONE;
    } else if (format == "auto") {
        return COMMON_REASONING_FORMAT_AUTO;
    } else if (format == "deepseek") {
        return COMMON_REASONING_FORMAT_DEEPSEEK;
    } else if (format == "deepseek-legacy") {
        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
    }
    throw std::runtime_error("Unknown reasoning format: " + format);
}

static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
    std::string arguments;
    if (builder.is_partial()) {

@@ -792,10 +748,10 @@ static std::string apply(
    // instead of using `chat_template_options.use_bos_token = false`, since these tokens
    // may be needed inside the template / between messages too.
    auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
    if (string_starts_with(result, tmpl.bos_token())) {
        result = result.substr(tmpl.bos_token().size());
    }
    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
    if (string_ends_with(result, tmpl.eos_token())) {
        result = result.substr(0, result.size() - tmpl.eos_token().size());
    }
    return result;

@@ -1333,174 +1289,6 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
        tool_calls_end);
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    auto prompt = apply(tmpl, inputs);

    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_GPT_OSS;

    // These special tokens are required to parse properly, so we include them
    // even if parse_tool_calls is false.
    data.preserved_tokens = {
        "<|channel|>",
        "<|constrain|>",
        "<|message|>",
        "<|start|>",
        "<|end|>",
    };

    if (inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // tool calls can appear in commentary or analysis channels
            auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");

            std::vector<std::string> tool_rules_recipient_in_role;
            std::vector<std::string> tool_rules_recipient_in_channel;
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                std::string name = function.at("name");
                auto parameters = function.at("parameters");
                builder.resolve_refs(parameters);

                tool_rules_recipient_in_role.push_back(
                    builder.add_rule(name + "-call",
                        "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
                        builder.add_schema(name + "-args", parameters)
                    )
                );

                tool_rules_recipient_in_channel.push_back(
                    builder.add_rule(name + "-call",
                        "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
                        builder.add_schema(name + "-args", parameters)
                    )
                );
            });

            auto recipient_in_role = builder.add_rule("recipient_in_role",
                "\"<|start|>assistant\"? \" to=functions.\" ( " +
                string_join(tool_rules_recipient_in_role, " | ") + " )"
            );

            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
                channel + " \" to=functions.\" ( " +
                string_join(tool_rules_recipient_in_channel, " | ") + " )"
            );

            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);

            // Trigger on tool calls that appear in the commentary channel
            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
                "<\\|channel\\|>(commentary|analysis) to"
            });

            // Trigger tool calls that appear in the role section, either at the
            // start or in the middle.
            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
                "^ to"
            });

            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
                "<\\|start\\|>assistant to"
            });
        });
    }

    return data;
}
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");

    static const common_regex start_regex("<\\|start\\|>assistant");
    static const common_regex analysis_regex("<\\|channel\\|>analysis");
    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
    static const common_regex preamble_regex("<\\|channel\\|>commentary");
    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");

    auto consume_end = [&](bool include_end = false) {
        if (auto res = builder.try_find_literal("<|end|>")) {
            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
        }
        return builder.consume_rest();
    };

    auto handle_tool_call = [&](const std::string & name) {
        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
            if (builder.syntax().parse_tool_calls) {
                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
                    throw common_chat_msg_partial_exception("incomplete tool call");
                }
            } else if (args->is_partial) {
                throw common_chat_msg_partial_exception("incomplete tool call");
            }
        }
    };

    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
        auto match = regex.search(input, 0, true);
        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
            return match;
        }
        return std::nullopt;
    };

    do {
        auto header_start_pos = builder.pos();
        auto content_start = builder.try_find_literal("<|message|>");
        if (!content_start) {
            throw common_chat_msg_partial_exception("incomplete header");
        }

        auto header = content_start->prelude;

        if (auto match = regex_match(tool_call1_regex, header)) {
            auto group = match->groups[1];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        if (auto match = regex_match(tool_call2_regex, header)) {
            auto group = match->groups[2];
            auto name = header.substr(group.begin, group.end - group.begin);
            handle_tool_call(name);
            continue;
        }

        if (regex_match(analysis_regex, header)) {
            builder.move_to(header_start_pos);
            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                builder.add_content(consume_end(true));
            } else {
                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
            }
            continue;
        }

        if (regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
            builder.add_content(consume_end());
            continue;
        }

        // Possibly a malformed message, attempt to recover by rolling
        // back to pick up the next <|start|>
        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
        builder.move_to(header_start_pos);
    } while (builder.try_find_regex(start_regex, std::string::npos, false));

    auto remaining = builder.consume_rest();
    if (!remaining.empty()) {
        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
    }
}
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    LOG_DBG("%s\n", __func__);
    common_chat_params data;

@@ -1858,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
        "|<function name=\"([^\"]+)\">" // match 5 (function name again)
    );

    while (auto res = builder.try_find_regex(open_regex)) {
    if (auto res = builder.try_find_regex(open_regex)) {
        const auto & block_start = res->groups[1];
        std::string block_end = block_start.empty() ? "" : "```";

@@ -1880,6 +1668,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
            builder.consume_literal(block_end);
            builder.consume_spaces();
        }
        builder.add_content(builder.consume_rest());
    } else {
        throw common_chat_msg_partial_exception("failed to parse tool call");
    }

@@ -1904,124 +1693,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                builder.consume_spaces();
            }
        }
    }
}

    builder.add_content(builder.consume_rest());
}
static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

    // Pass thinking context for Granite template
    json additional_context = {
        {"thinking", inputs.enable_thinking},
    };

    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
    data.format = COMMON_CHAT_FORMAT_GRANITE;

    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
        if (!inputs.enable_thinking) {
            data.prompt += "</think>";
        } else {
            data.thinking_forced_open = true;
        }
    }

    if (!inputs.tools.is_null()) {
        // Granite uses <|tool_call|> followed by JSON list
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            std::vector<std::string> tool_rules;
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                std::string name = function.at("name");
                auto parameters = function.at("parameters");
                builder.resolve_refs(parameters);
                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
                    "-args", {
                        {"type", "object"},
                        {"properties", {
                            {"name", {{"const", name}}},
                            {"arguments", parameters},
                        }},
                        {"required", json::array({"name", "arguments"})},
                    })));
            });

            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");

            if (data.thinking_forced_open) {
                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
            } else {
                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
            }

            data.grammar_triggers.push_back({
                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
                "<|tool_call|>"
            });

            data.preserved_tokens = {
                "<think>",
                "</think>",
                "<response>",
                "</response>",
                "<|tool_call|>",
            };
        });
    } else {
        // Handle thinking tags for non-tool responses
        if (data.thinking_forced_open && inputs.enable_thinking) {
            data.grammar_lazy = false;
            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
            });
            data.preserved_tokens = {
                "<think>",
                "</think>",
                "<response>",
                "</response>",
            };
        }
    }

    return data;
}

static void common_chat_parse_granite(common_chat_msg_parser & builder) {
    // Parse thinking tags
    builder.try_parse_reasoning("<think>", "</think>");

    // Parse response tags using regex
    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
    if (auto res = builder.try_find_regex(response_regex)) {
        // Extract the content between the tags (capture group 1)
        auto content = builder.str(res->groups[1]);
        builder.add_content(content);
        builder.move_to(res->groups[0].end);
    }

    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Look for tool calls
    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
    if (auto res = builder.try_find_regex(tool_call_regex)) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
        auto tool_calls_data = builder.consume_json();
        if (tool_calls_data.json.is_array()) {
            if (!builder.add_tool_calls(tool_calls_data.json)) {
                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
            }
        } else {
            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
            builder.add_content(builder.consume_rest());
        }
    } else {
        builder.add_content(builder.consume_rest());
@@ -2061,8 +1733,6 @@ static common_chat_params common_chat_templates_apply_jinja(
    params.enable_thinking = inputs.enable_thinking;
    params.grammar = inputs.grammar;
    params.now = inputs.now;
    params.add_bos = tmpls->add_bos;
    params.add_eos = tmpls->add_eos;

    params.extra_context = json::object();
    for (auto el : inputs.chat_template_kwargs) {

@@ -2099,21 +1769,11 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_command_r7b(tmpl, params);
    }

    // Granite (IBM) - detects thinking / tools support
    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
        return common_chat_params_init_granite(tmpl, params);
    }

    // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_hermes_2_pro(tmpl, params);
    }

    // GPT-OSS
    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_gpt_oss(tmpl, params);
    }

    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -2164,7 +1824,6 @@ static common_chat_params common_chat_templates_apply_legacy(
    int alloc_size = 0;
    std::vector<llama_chat_message> chat;
    std::vector<std::string> contents;

    for (const auto & msg : inputs.messages) {
        auto content = msg.content;
        for (const auto & part : msg.content_parts) {

@@ -2266,12 +1925,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_COMMAND_R7B:
            common_chat_parse_command_r7b(builder);
            break;
        case COMMON_CHAT_FORMAT_GRANITE:
            common_chat_parse_granite(builder);
            break;
        case COMMON_CHAT_FORMAT_GPT_OSS:
            common_chat_parse_gpt_oss(builder);
            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }

@@ -2291,8 +1944,6 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
        }
    }
    auto msg = builder.result();
    if (!is_partial) {
        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
    }
    LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
    return msg;
}
@@ -109,8 +109,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
    COMMON_CHAT_FORMAT_GPT_OSS,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};

@@ -129,8 +127,6 @@ struct common_chat_templates_inputs {
    bool enable_thinking = true;
    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
    std::map<std::string, std::string> chat_template_kwargs;
    bool add_bos = false;
    bool add_eos = false;
};

struct common_chat_params {

@@ -187,12 +183,10 @@ std::string common_chat_format_single(
// Returns an example of formatted chat
std::string common_chat_format_example(
    const struct common_chat_templates * tmpls,
    bool use_jinja,
    const std::map<std::string, std::string> & chat_template_kwargs);
    bool use_jinja);

const char * common_chat_format_name(common_chat_format format);
const char * common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -41,7 +41,6 @@
#endif
#include <locale>
#include <windows.h>
#include <string.h>
#include <fcntl.h>
#include <io.h>
#else

@@ -1123,7 +1122,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;

@@ -1566,56 +1564,3 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
    return result;
}

ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
    const lr_opt & d = *(lr_opt *) userdata;
    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
    result.sgd.wd = result.adamw.wd = d.wd;
    return result;
}

// TODO make all command line args case-insensitive
static inline bool eq_case_insensitive(char const* a, char const* b) {
    return !
#if defined(_MSC_VER)
        _stricmp
#else
        strcasecmp
#endif // defined(_MSC_VER)
        (a, b);
}

enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
    if (eq_case_insensitive("adamw", n)) {
        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    }
    if (eq_case_insensitive("sgd", n)) {
        return GGML_OPT_OPTIMIZER_TYPE_SGD;
    }
    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
}

// TODO simplify to use just log and exp
static float const k_log_2 = std::log(2.f);

void lr_opt::init() {
    if (lr_min > 0 && lr_min < lr0) {
        float nhalf = std::log(lr0 / lr_min) / k_log_2;
        float e = epochs;
        if (decay_epochs > 0 && decay_epochs < e) {
            e = decay_epochs;
        } else {
            decay_epochs = e;
        }
        scale_epoch = nhalf / e;
    }
}

float lr_opt::get_lr(float epoch) const {
    float r = lr_min <= 0 ? lr0 :
        epoch >= decay_epochs ? lr_min :
        lr0 * std::pow(0.5f, epoch * scale_epoch);
    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
    return r;
}
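As a concrete illustration of the half-life schedule implemented by `lr_opt` above, here is a small self-contained sketch; the numbers (lr0, lr_min, and the 6-epoch decay window) are chosen for the example and are not the defaults shown in the struct.

```cpp
// Illustration only: lr0 / lr_min = 8, so nhalf = log(8)/log(2) = 3 halvings,
// spread over 6 epochs => scale_epoch = 3 / 6 = 0.5, exactly as lr_opt::init()
// computes above.
#include <cmath>
#include <cstdio>

int main() {
    const float lr0 = 1e-5f, lr_min = 1.25e-6f, decay_epochs = 6.0f;
    const float nhalf       = std::log(lr0 / lr_min) / std::log(2.0f); // 3
    const float scale_epoch = nhalf / decay_epochs;                    // 0.5
    for (float e = 0.0f; e <= 6.0f; e += 2.0f) {
        const float lr = e >= decay_epochs ? lr_min
                                           : lr0 * std::pow(0.5f, e * scale_epoch);
        std::printf("epoch %.0f lr=%g\n", e, lr); // 1e-05, 5e-06, 2.5e-06, 1.25e-06
    }
    return 0;
}
```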
@@ -2,17 +2,14 @@

#pragma once

#include "llama-cpp.h"

#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#include <cmath>

#include "ggml-opt.h"
#include "llama-cpp.h"

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'

@@ -85,7 +82,6 @@ enum llama_example {
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,

    LLAMA_EXAMPLE_COUNT,
};
@@ -205,8 +201,6 @@ struct common_params_speculative {
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
    float p_split = 0.1f;      // speculative decoding split probability
    float p_min = 0.75f;       // minimum speculative decoding probability (greedy)
    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

@@ -226,46 +220,19 @@ struct common_params_vocoder {
};

struct common_params_diffusion {
    int32_t steps = 128;
    bool visual_mode = false;

    float eps = 0;            // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4;    // default algorithm: low-confidence
    float alg_temp = 0.0f;    // algorithm temperature

    float cfg_scale = 0;      // classifier-free guidance scale
    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
    int32_t steps = 64;       // number of diffusion steps
    float eps = 1e-3f;        // epsilon for timesteps
    int32_t algorithm = 0;    // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
    float alg_temp = 0.0f;    // algorithm temperature
    bool visual_mode = false; // show progressive diffusion on screen
};

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

struct lr_opt {
    float lr0 = 1e-5;        // learning rate at first epoch
    float lr_min = -1;
    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float scale_epoch = 0;
    float wd = 0;
    unsigned epochs = 2;

    unsigned epoch; // set by optimizer outer (epochs) loop
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must call after arg parse, before get_lr
    void init();
};

struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
struct common_params {
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_ctx = 4096;   // context size

@@ -385,7 +352,6 @@ struct common_params {
    bool warmup = true;          // warmup run
    bool check_tensors = false;  // validate tensor data
    bool no_op_offload = false;  // globally disable offload host tensor operations to device
    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

    bool single_turn = false;    // single turn chat conversation

@@ -400,11 +366,6 @@ struct common_params {
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool embedding = false;     // get only sentence embedding
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)

@@ -413,12 +374,11 @@ struct common_params {
    std::string cls_sep = "\t"; // separator of classification sequences

    // server params
    int32_t port = 8080;                  // server listens on this network port
    int32_t timeout_read = 600;           // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1;          // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0;            // min chunk size to reuse from the cache via KV shifting
    int32_t n_swa_checkpoints = 3;        // max number of SWA checkpoints per slot
    int32_t port = 8080;                  // server listens on this network port
    int32_t timeout_read = 600;           // http read timeout in seconds
    int32_t timeout_write = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1;          // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse = 0;            // min chunk size to reuse from the cache via KV shifting

    std::string hostname = "127.0.0.1";
    std::string public_path = ""; // NOLINT

@@ -426,7 +386,7 @@ struct common_params {
    std::string chat_template = ""; // NOLINT
    bool use_jinja = false;         // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

@@ -471,7 +431,6 @@ struct common_params {
    int32_t n_out_freq = 10;  // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0;  // save the imatrix every n_save_freq iterations
    int32_t i_chunk = 0;      // start processing from this chunk
    int8_t imat_dat = 0;      // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl = true;     // whether to compute perplexity

@@ -733,6 +692,3 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
@@ -1,39 +1,30 @@
#include "speculative.h"

#include "ggml.h"
#include "llama.h"
#include "log.h"
#include "common.h"
#include "sampling.h"

#include <cstring>
#include <algorithm>
#include <map>

#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

struct common_speculative {
    struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
    struct llama_context * ctx_dft;
    struct llama_context * ctx;
    struct common_sampler * smpl;

    llama_batch batch;
    llama_tokens prompt_dft;
    bool vocab_dft_compatible = true; // whether retokenization is needed
    std::map<std::string, std::string> tgt_dft_replacements = {};
    llama_tokens prompt;
};

struct common_speculative * common_speculative_init(
        struct llama_context * ctx_tgt,
        struct llama_context * ctx_dft) {
    auto * result = new common_speculative {
        /* .ctx_tgt              = */ ctx_tgt,
        /* .ctx_dft              = */ ctx_dft,
        /* .smpl                 = */ nullptr,
        /* .batch                = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
        /* .prompt_dft           = */ {},
        /* .vocab_dft_compatible = */ false,
        /* .ctx    = */ ctx_dft,
        /* .smpl   = */ nullptr,
        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
        /* .prompt = */ {},
    };

    // TODO: optimize or pass from outside?

@@ -68,9 +59,6 @@ struct common_speculative * common_speculative_init(
    }
#endif

    result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
    LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);

    return result;
}
@@ -87,8 +75,8 @@ void common_speculative_free(struct common_speculative * spec) {
}

bool common_speculative_are_compatible(
    const struct llama_context * ctx_tgt,
    const struct llama_context * ctx_dft) {
    const struct llama_context * ctx_tgt,
    const struct llama_context * ctx_dft) {
    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
    const struct llama_model * model_dft = llama_get_model(ctx_dft);

@@ -102,32 +90,31 @@ bool common_speculative_are_compatible(
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
        LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
        LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
        return false;
    }

    if (
        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
    if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
        llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
        llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
    ) {
        LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
        LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
        return false;
    }

    {
        const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
        const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
        const int vocab_diff = n_vocab_tgt > n_vocab_dft
            ? n_vocab_tgt - n_vocab_dft
            : n_vocab_dft - n_vocab_tgt;

        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
            LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
            LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return false;
        }

@@ -135,8 +122,8 @@ bool common_speculative_are_compatible(
        const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
        const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
        if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
            LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
            LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
            LOG_ERR("%s: draft vocab must match target vocab to use speculation but "
                    "token %d content differs - target '%s', draft '%s'\n", __func__, i,
                common_token_to_piece(ctx_tgt, i).c_str(),
                common_token_to_piece(ctx_dft, i).c_str());
            return false;

@@ -147,93 +134,32 @@ bool common_speculative_are_compatible(
    return true;
}
void common_speculative_add_replacement_tgt_dft(
        struct common_speculative * spec,
        const char * source, const char * dest) {
    spec->tgt_dft_replacements[source] = dest;
}

static std::string replace_to_dft(
        struct common_speculative * spec,
        const std::string & input) {
    std::string result = input;
    for (const auto & pair : spec->tgt_dft_replacements) {
        size_t pos = result.find(pair.first);
        while (pos != std::string::npos) {
            result.replace(pos, pair.first.length(), pair.second);
            pos = result.find(pair.first, pos + pair.second.length());
        }
    }
    return result;
}

static std::string replace_to_tgt(
        struct common_speculative * spec,
        const std::string & input) {
    std::string result = input;
    for (const auto & pair : spec->tgt_dft_replacements) {
        size_t pos = result.find(pair.second);
        while (pos != std::string::npos) {
            result.replace(pos, pair.second.length(), pair.first);
            pos = result.find(pair.second, pos + pair.first.length());
        }
    }
    return result;
}
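For context, a minimal sketch of how the replacement table above is meant to be filled (the token strings are purely illustrative, not taken from a real model pair): once registered, `replace_to_dft()` rewrites each target-vocab spelling into the draft-vocab one, and `replace_to_tgt()` reverses the mapping on the way back.

```cpp
// Hypothetical example: spec is an initialized common_speculative whose target
// and draft models spell their special tokens differently.
common_speculative_add_replacement_tgt_dft(spec, "<|im_start|>", "<s>");
common_speculative_add_replacement_tgt_dft(spec, "<|im_end|>", "</s>");
// replace_to_dft(spec, "<|im_start|>user") -> "<s>user"
// replace_to_tgt(spec, "<s>user")          -> "<|im_start|>user"
```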
llama_tokens common_speculative_gen_draft(
        struct common_speculative * spec,
        struct common_speculative_params params,
        const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
        const llama_tokens & prompt_tgt,
        llama_token id_last) {
    auto & batch = spec->batch;
    auto & ctx_tgt = spec->ctx_tgt;
    auto & ctx_dft = spec->ctx_dft;
    auto & ctx = spec->ctx;
    auto & smpl = spec->smpl;
    auto & prompt_dft = spec->prompt_dft;
    auto & prompt = spec->prompt;

    auto * mem_dft = llama_get_memory(ctx_dft);
    auto * mem = llama_get_memory(ctx);

    int reuse_i = 0;
    int reuse_n = 0;

    const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;

    llama_tokens prompt_tgt_draft_model;
    if (!spec->vocab_dft_compatible) {
        std::string text;
        text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
        text = replace_to_dft(spec, text);
        LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
        prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);

        // convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
        const auto * model_tgt = llama_get_model(ctx_tgt);
        const auto * vocab_tgt = llama_model_get_vocab(model_tgt);

        int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
        GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
        text.resize(-n_chars);
        llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
        text = replace_to_dft(spec, text);

        LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
        id_last = common_tokenize(ctx_dft, text, false, true)[0];
    }
    // prompt_tgt's tokens will always be compatible with ctx_dft
    const llama_tokens & prompt_tgt =
        spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;

    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);

    // reuse as much as possible from the old draft context
    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
    for (int i = 0; i < (int) prompt_dft.size(); ++i) {
    for (int i = 0; i < (int) prompt.size(); ++i) {
        int cur = 0;
        while (i_start + cur < (int) prompt_tgt.size() &&
               i + cur < (int) prompt_dft.size() &&
               prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
               i + cur < (int) prompt.size() &&
               prompt_tgt[i_start + cur] == prompt[i + cur]) {
            cur++;
        }

@@ -243,20 +169,21 @@ llama_tokens common_speculative_gen_draft(
        }
    }

    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());

    llama_tokens result;
    result.reserve(params.n_draft);

    if (reuse_n == 0) {
        llama_memory_clear(mem_dft, false);
        prompt_dft.clear();
        llama_memory_clear(mem, false);

        prompt.clear();
    } else {
        // this happens when a previous draft has been discarded (for example, due to being too small), but the
        // target model agreed with it. in this case, we simply pass back the previous results to save compute
        if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
            for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
                result.push_back(prompt_dft[i]);
        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
                result.push_back(prompt[i]);

                if (params.n_draft <= (int) result.size()) {
                    break;

@@ -267,15 +194,16 @@ llama_tokens common_speculative_gen_draft(
        }

        if (reuse_i > 0) {
            llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
            llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
            llama_memory_seq_rm (mem, 0, 0, reuse_i);
            llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);

            prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
        }

        if (reuse_n < (int) prompt_dft.size()) {
            llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
            prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
        if (reuse_n < (int) prompt.size()) {
            llama_memory_seq_rm (mem, 0, reuse_n, -1);

            prompt.erase(prompt.begin() + reuse_n, prompt.end());
        }
    }

@@ -286,28 +214,28 @@ llama_tokens common_speculative_gen_draft(
        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);

        prompt_dft.push_back(prompt_tgt[i]);
        prompt.push_back(prompt_tgt[i]);
    }

    // we should rarely end-up here during normal decoding
    if (batch.n_tokens > 0) {
        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());

        llama_decode(ctx_dft, batch);
        llama_decode(ctx, batch);
    }

    const llama_pos n_past = prompt_dft.size();
    const llama_pos n_past = prompt.size();

    LOG_DBG("%s: n_past = %d\n", __func__, n_past);

    common_batch_clear(batch);
    common_batch_add (batch, id_last, n_past, { 0 }, true);

    prompt_dft.push_back(id_last);
    prompt.push_back(id_last);

    LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());

    llama_decode(ctx_dft, batch);
    llama_decode(ctx, batch);

    common_sampler_reset(smpl);

@@ -315,13 +243,13 @@ llama_tokens common_speculative_gen_draft(
    for (int i = 0; i < params.n_draft; ++i) {
        common_batch_clear(batch);

        common_sampler_sample(smpl, ctx_dft, 0, true);
        common_sampler_sample(smpl, ctx, 0, true);

        const auto * cur_p = common_sampler_get_candidates(smpl);

        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
        }

        // add drafted token for each sequence

@@ -343,19 +271,10 @@ llama_tokens common_speculative_gen_draft(
        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

        // evaluate the drafted tokens on the draft model
        llama_decode(ctx_dft, batch);
        llama_decode(ctx, batch);

        prompt_dft.push_back(id);
        prompt.push_back(id);
    }

    if (!spec->vocab_dft_compatible) {
        std::string detokenized = common_detokenize(ctx_dft, result, true);
        detokenized = replace_to_tgt(spec, detokenized);
        LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
        result = common_tokenize(ctx_tgt, detokenized, false, true);
        if (result.size() > (size_t) params.n_draft) {
            result.resize(params.n_draft);
        }
    }
    return result;
}
@@ -12,10 +12,7 @@ struct common_speculative_params {
    float p_min = 0.75f; // min probability required to accept a token in the draft
};

struct common_speculative * common_speculative_init(
        struct llama_context * ctx_tgt,
        struct llama_context * ctx_dft
);
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

void common_speculative_free(struct common_speculative * spec);

@@ -23,10 +20,6 @@ bool common_speculative_are_compatible(
    const struct llama_context * ctx_tgt,
    const struct llama_context * ctx_dft);

void common_speculative_add_replacement_tgt_dft(
    struct common_speculative * spec,
    const char * source, const char * dest);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
    struct common_speculative * spec,
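To tie the declarations above together, here is a minimal, hypothetical calling sequence using the two-context variant of the API (contexts and the surrounding target-model loop are assumed to exist elsewhere; `n_draft` is assumed to be a field of `common_speculative_params`, as its use in `speculative.cpp` above implies).

```cpp
// Sketch of one speculative-decoding step; not the definitive integration.
common_speculative_params params;
params.n_draft = 16;    // max tokens to draft per step (assumed field)
params.p_min   = 0.75f; // min draft probability, as documented above

common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);

// prompt_tgt: tokens so far in the target vocab; id_last: last sampled token
llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);
// ... evaluate `draft` with the target model and accept the longest agreeing prefix ...

common_speculative_free(spec);
```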
(File diff suppressed because it is too large)
@@ -59,10 +59,6 @@ parser.add_argument(
    "--full", action="store_true",
    help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
    "--check-missing", action="store_true",
    help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
    "hf_token",
    help="optional HF token",

@@ -74,10 +70,6 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")

if args.check_missing and args.full:
    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
    args.check_missing = False

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

@@ -138,7 +130,6 @@ models = [
    {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
    {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
    {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
]

# some models are known to be broken upstream, so we will skip them as exceptions

@@ -147,17 +138,14 @@ pre_computed_hashes = [
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
]

@@ -232,13 +220,12 @@ if not args.full:
    all_models = models.copy()
    models = [model for model in all_models if model["name"] not in existing_models]

if not args.check_missing:
    logging.info(f"Downloading {len(models)} models...")
    for model in models:
        try:
            download_model(model)
        except Exception as e:
            logger.error(f"Failed to download model {model['name']}. Error: {e}")
logging.info(f"Downloading {len(models)} models...")
for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")

# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

@@ -340,7 +340,7 @@ if __name__ == '__main__':
    sys.exit(1)
else:
    logger.info(f"Loading base model: {dir_base_model.name}")
    hparams = ModelBase.load_hparams(dir_base_model, False)
    hparams = ModelBase.load_hparams(dir_base_model)

with torch.inference_mode():
    try:
@@ -310,7 +310,5 @@ Specifies the memory pool management strategy:

Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.

### GGML_CANN_WEIGHT_NZ

Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
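Assuming this option is toggled through an environment variable like the other GGML_CANN_* settings in this section (an assumption; the diff does not show the mechanism), enabling it might look like:

```bash
# assumed usage: enable ND -> NZ weight conversion before running on a 310I DUO NPU
export GGML_CANN_WEIGHT_NZ=1
./build/bin/llama-cli -m model.gguf -ngl 99
```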
## TODO
- Support more models and data types.
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- By default, NNPA is disabled by default. To enable it:
|
||||
- By default, NNPA is enabled when available. To disable it (not recommended):
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_NNPA=ON
|
||||
-DGGML_NNPA=OFF
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
@@ -76,23 +76,6 @@ cmake --build build --config Release -j $(nproc)
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
## IBM zDNN Accelerator
|
||||
|
||||
This provides acceleration using the IBM zAIU co-processor located in the Telum I and Telum II processors. Make sure to have the [IBM zDNN library](https://github.com/IBM/zDNN) installed.
|
||||
|
||||
#### Compile from source from IBM
|
||||
|
||||
You may find the official build instructions here: [Building and Installing zDNN](https://github.com/IBM/zDNN?tab=readme-ov-file#building-and-installing-zdnn)
|
||||
|
||||
### Compilation
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_ZDNN=ON
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
```
|
||||
|
||||
## Getting GGUF Models
|
||||
|
||||
All models need to be converted to Big-Endian. You can achieve this in three cases:
|
||||
@@ -101,9 +84,9 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
You can find popular models pre-converted and verified at [s390x Verified Models](https://huggingface.co/collections/taronaeo/s390x-verified-models-672765393af438d0ccb72a08) or [s390x Runnable Models](https://huggingface.co/collections/taronaeo/s390x-runnable-models-686e951824198df12416017e).
|
||||
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
|
||||
|
||||
These models have already been converted from `safetensors` to `GGUF` Big-Endian and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
|
||||
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
|
||||
|
||||
@@ -111,14 +94,6 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
|
||||
|
||||
Ensure that you have installed the required packages in advance
|
||||
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
Convert the `safetensors` model to `GGUF`
|
||||
|
||||
```bash
|
||||
python3 convert_hf_to_gguf.py \
|
||||
--outfile model-name-be.f16.gguf \
|
||||
@@ -141,7 +116,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||
|
||||

|
||||
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B GGUF](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
|
||||
|
||||
```bash
|
||||
python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
|
||||
@@ -162,19 +137,19 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

### 1. SIMD Acceleration

Only available on IBM z15/LinuxONE 3 or later systems with the `-DGGML_VXE=ON` compile flag (turned on by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12; on such systems the APIs can still run, but will use a scalar implementation.
Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` compile flag (turned on by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12; on such systems the APIs can still run, but will use a scalar implementation.

### 2. NNPA Vector Intrinsics Acceleration

Only available on IBM z16/LinuxONE 4 or later systems with the `-DGGML_NNPA=ON` compile flag (turned off by default). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13; on such systems the APIs can still run, but will use a scalar implementation.
Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` compile flag (turned on when available). No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13; on such systems the APIs can still run, but will use a scalar implementation.
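A hedged configure sketch combining the flags above with the build recipe shown earlier (flag defaults are as described in the surrounding text):

```bash
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_VXE=ON \
    -DGGML_NNPA=OFF
cmake --build build --config Release -j$(nproc)
```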
### 3. zDNN Accelerator (WIP)
### 3. zDNN Accelerator

Only available on IBM z17/LinuxONE 5 or later systems with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13; on such systems the APIs will fall back to CPU routines.
_Only available on IBM z16 or later systems. No direction at the moment._

### 4. Spyre Accelerator

_Only available with IBM z17 / LinuxONE 5 or later systems. No support currently available._
_No direction at the moment._
## Performance Tuning

@@ -214,26 +189,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.

4. Failing to install the `sentencepiece` package using GCC 15+

Answer: The `sentencepiece` team is aware of this, as seen in [this issue](https://github.com/google/sentencepiece/issues/1108).

As a temporary workaround, please run the installation command with the following environment variable.

```bash
export CXXFLAGS="-include cstdint"
```

For example:

```bash
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```
5. `-DGGML_NNPA=ON` generates gibberish output

Answer: We are aware of this, as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF` (both mitigations are sketched below).
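A minimal sketch of the two mitigations; the model path and thread count are illustrative:

```bash
# 1) retry with fewer threads
./build/bin/llama-cli -m model-name-be.f16.gguf -t 2 -p "Hello"

# 2) or rebuild with NNPA disabled
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NNPA=OFF
cmake --build build --config Release -j$(nproc)
```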
## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**

@@ -246,12 +201,11 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

## Appendix A: Hardware Support Matrix

|          | Support | Minimum Compiler Version |
| -------- | ------- | ------------------------ |
| IBM z15  | ✅      |                          |
| IBM z16  | ✅      |                          |
| IBM z17  | ✅      | GCC 15.1.0               |
| IBM zDNN | ✅      |                          |

|         | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅      |                          |
| IBM z16 | ✅      |                          |
| IBM z17 | ✅      | GCC 15.1.0               |

- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely able to provide support
@@ -260,7 +214,7 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

|      | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---- | ----------- | ---- | ---- | ----- |
| FP32 | ✅          | ✅   | ✅   | ❓    |
| FP32 | ✅          | ✅   | ❓   | ❓    |
| FP16 | ✅          | ✅   | ❓   | ❓    |
| BF16 | 🚫          | 🚫   | ❓   | ❓    |
| Q4_0 | ✅          | ✅   | ❓   | ❓    |

@@ -290,5 +244,3 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025.
@@ -23,19 +23,11 @@ The convert script reads the model configuration, tokenizer, tensor names+data a

The required steps to implement for an HF model are:

1. Define the model `ModelBase.register` annotation in a new `TextModel` or `MmprojModel` subclass, example:
1. Define the model `Model.register` annotation in a new `Model` subclass, example:

```python
@ModelBase.register("MyModelForCausalLM")
class MyModel(TextModel):
    model_arch = gguf.MODEL_ARCH.MYMODEL
```

or

```python
@ModelBase.register("MyModelForConditionalGeneration")
class MyModel(MmprojModel):
@Model.register("MyModelForCausalLM")
class MyModel(Model):
    model_arch = gguf.MODEL_ARCH.MYMODEL
```
@@ -83,10 +75,9 @@ block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {

`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.

Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
- `TextModel#set_gguf_parameters`
- `MmprojModel#set_gguf_parameters`
- `ModelBase#set_vocab`
- `ModelBase#modify_tensors`
- `Model#set_gguf_parameters`
- `Model#set_vocab`
- `Model#write_tensors`

NOTE: Tensor names must end with a `.weight` or `.bias` suffix; that is the convention, and several tools like `quantize` expect it in order to process the weights.
@@ -97,9 +97,6 @@ NOTE: some models may require large context window, for example: `-c 8192`

# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF for this model, as it gives very poor results
# ref: https://github.com/ggml-org/llama.cpp/pull/13760

# Mistral's Voxtral
(tool_name) -hf ggml-org/Voxtral-Mini-3B-2507-GGUF
```

**Mixed modalities**:
@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta

Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

@@ -29,8 +29,8 @@ cmake --build build --config Release

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --minicpmv_version 4
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model

# quantize int4 version
```
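The hunk ends before the quantize command. By analogy with the complete MiniCPM-o 4 instructions below, the step presumably looks like this (a hedged reconstruction, not verbatim from this diff):

```bash
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```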
@@ -1,47 +0,0 @@

## MiniCPM-o 4

### Prepare models and code

Download the [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from huggingface to the "MiniCPM-o-4" folder.

### Build llama.cpp
Readme modification time: 20250206

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```

### Usage of MiniCPM-o 4

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Inference on Linux or Mac:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
```

@@ -28,8 +28,8 @@ cmake --build build --config Release

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --minicpmv_version 2
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

# quantize int4 version
```

@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta

Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

@@ -28,8 +28,8 @@ cmake --build build --config Release

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --minicpmv_version 3
python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

# quantize int4 version
```

@@ -1,47 +0,0 @@

## MiniCPM-V 4

### Prepare models and code

Download the [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from huggingface to the "MiniCPM-V-4" folder.

### Build llama.cpp
Readme modification time: 20250206

If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```

Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```

### Usage of MiniCPM-V 4

Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf) from us):

```bash
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model

# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Inference on Linux or Mac:
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"

# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
```
@@ -2,102 +2,94 @@

List of GGML operations and backend support status.

## How to add a backend to this table:

1. Run `test-backend-ops support --output csv` with your backend name and redirect output to a csv file in `docs/ops/` (e.g., `docs/ops/CUDA.csv`)
2. Regenerate `/docs/ops.md` via `./scripts/create_ops_docs.py` (both steps are sketched below)
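A minimal sketch of the two steps for a hypothetical `CUDA` backend, assuming you run from the repository root with a built `test-backend-ops` binary (binary location is an assumption):

```bash
./build/bin/test-backend-ops support --output csv > docs/ops/CUDA.csv
./scripts/create_ops_docs.py
```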
Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| Operation | BLAS | CPU | CUDA | Metal |
|-----------|------|------|------|------|
| ABS | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | 🟡 |
| ADD1 | ❌ | ✅ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
| CONT | ❌ | ✅ | 🟡 | ✅ |
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
| DIV | ❌ | ✅ | ✅ | 🟡 |
| DUP | ❌ | ✅ | 🟡 | 🟡 |
| ELU | ❌ | ✅ | ❌ | 🟡 |
| EXP | ❌ | ✅ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ |
| MUL | ❌ | ✅ | ✅ | 🟡 |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
| NEG | ❌ | ✅ | 🟡 | 🟡 |
| NORM | ❌ | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
| ROPE | ❌ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
| SCALE | ❌ | ✅ | ✅ | ✅ |
| SET | ❌ | ✅ | ❌ | ✅ |
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
| SGN | ❌ | ✅ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
| SILU | ❌ | ✅ | 🟡 | 🟡 |
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | 🟡 |
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | 🟡 |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
| STEP | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | 🟡 |
| SUM | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
| TANH | ❌ | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |
(Diffs for docs/ops/BLAS.csv, CANN.csv, CPU.csv, CUDA.csv, Metal.csv, OpenCL.csv, SYCL.csv, Vulkan.csv, and zDNN.csv suppressed because they are too large.)
@@ -1,13 +0,0 @@

# Diffusion Text Generation

This directory contains implementations for Diffusion LLMs (DLLMs).

More Info:
- https://github.com/ggml-org/llama.cpp/pull/14644
- https://github.com/ggml-org/llama.cpp/pull/14771

Example of using the Dream architecture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`

Example of using the LLaDA architecture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
@@ -5,128 +5,344 @@
#include "log.h"

#include <limits.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
#include <random>
#include <string>
#include <vector>
#include <algorithm>
#include <cmath>
#include <limits>
#include <random>

enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };

// Unified transfer scheduling methods
enum transfer_schedule {
    TIMESTEP_BASED = 0,  // Dream-style: (1.0 - s/t) * remaining
    BLOCK_BASED    = 1,  // LLaDA-style: process in blocks with get_num_transfer_tokens
};
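// note (editorial): with t = 1 - (step/total_steps)*(1 - eps) and s the same expression
// at step + 1, TIMESTEP_BASED transfers a fraction (1 - s/t) of the currently masked
// positions per step; BLOCK_BASED instead spreads each block's masked-token budget
// evenly over that block's steps (see get_num_transfer_tokens below).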
typedef bool (*diffusion_step_callback_t)(int32_t step,
                                          int32_t total_steps,
typedef bool (*diffusion_step_callback_t)(int32_t step,
                                          int32_t total_steps,
                                          const llama_token * tokens,
                                          int32_t n_tokens,
                                          void * user_data);
                                          int32_t n_tokens,
                                          void * user_data);

enum diffusion_alg {
    DIFFUSION_ALG_ORIGIN       = 0,
    DIFFUSION_ALG_MASKGIT_PLUS = 1,
    DIFFUSION_ALG_TOPK_MARGIN  = 2,
    DIFFUSION_ALG_ENTROPY      = 3,
};

struct diffusion_params {
    int32_t                   steps                   = 0;
    float                     temperature             = 0;
    llama_token               mask_token_id           = LLAMA_TOKEN_NULL;
    diffusion_step_callback_t step_callback           = nullptr;
    void *                    step_callback_user_data = nullptr;
    int32_t                   seed                    = 0;
    bool                      visual_mode             = false;
    bool                      shift_logits            = false;  // Shift logits by -1 after decode

    float   top_p = 0.;
    int32_t top_k = 0.;

    diffusion_algorithm algorithm = CONFIDENCE_BASED;
    transfer_schedule   schedule  = TIMESTEP_BASED;

    float   cfg_scale        = 0.;     // Config scale for classifier-free guidance
    float   eps              = 0.;     // Timestep scheduling
    int32_t block_length     = 0;      // Block size (for block scheduling)
    float   alg_temp         = 0;      // algorithm temperature (0.0 = deterministic)
    bool    add_gumbel_noise = false;  // Add gumbel noise to the logits if temp > 0.0

    int32_t max_length = 0;  // Maximum sequence length
    int32_t                   steps;
    float                     eps;
    float                     temperature;
    float                     top_p;
    int32_t                   top_k;
    llama_token               mask_token_id;
    enum diffusion_alg        algorithm;
    float                     alg_temp;
    diffusion_step_callback_t step_callback;
    void *                    step_callback_user_data;
    int32_t                   seed;
};

static diffusion_params diffusion_default_params() {
    diffusion_params params        = {};
    params.steps                   = 64;
    params.eps                     = 1e-3f;
    params.temperature             = 0.2f;
    params.top_p                   = 0.95f;
    params.top_k                   = 0;
    params.mask_token_id           = LLAMA_TOKEN_NULL;
    params.algorithm               = DIFFUSION_ALG_ORIGIN;
    params.alg_temp                = 0.0f;
    params.step_callback           = nullptr;
    params.step_callback_user_data = nullptr;
    params.seed                    = 0;
    return params;
}

static void diffusion_generate(llama_context * ctx,
                               const llama_token * input_tokens,
                               llama_token * output_tokens,
                               int32_t n_input,
                               int32_t max_length,
                               struct diffusion_params params,
                               int32_t & n_generated) {

    n_generated = 0;
    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) {
        return;
    }

    const llama_model * model = llama_get_model(ctx);

    // Initialize with input and pad with mask tokens
    std::copy(input_tokens, input_tokens + n_input, output_tokens);
    std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id);

    std::mt19937 rng(params.seed);

    std::vector<float> timesteps(params.steps + 1);
    for (int32_t i = 0; i <= params.steps; i++) {
        timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps);
    }

    llama_set_causal_attn(ctx, false);

    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));

    std::vector<llama_token_data> candidates(n_vocab);

    std::vector<llama_token_data> conf_candidates;
    conf_candidates.reserve(max_length);

    std::vector<int32_t> mask_positions;
    mask_positions.reserve(max_length);

    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (params.top_k > 0) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
    }
    if (params.top_p < 1.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
    }
    if (params.temperature > 0.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
    }
    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));

    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);

    llama_batch batch = llama_batch_init(max_length, 0, 1);
    batch.n_tokens    = max_length;

    int64_t total_sampling_time = 0;
    int64_t total_time          = 0;

    int64_t time_start = ggml_time_us();
    for (int32_t step = 0; step < params.steps; step++) {
        if (params.step_callback) {
            if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) {
                break;
            }
        }

        for (int32_t i = 0; i < max_length; i++) {
            batch.token[i]     = output_tokens[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;
            batch.logits[i]    = 1;
        }

        int ret = llama_decode(ctx, batch);
        if (ret != 0) {
            LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret);
            break;
        }

        float * raw_logits = llama_get_logits(ctx);
        if (!raw_logits) {
            LOG_ERR("%s: failed to get logits at step %d\n", __func__, step);
            break;
        }

        auto get_logits_for_pos = [&](int32_t pos) -> const float * {
            return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab;
        };

        int64_t time_start_sampling = ggml_time_us();

        mask_positions.clear();
        for (int32_t i = 0; i < max_length; i++) {
            if (output_tokens[i] == params.mask_token_id) {
                mask_positions.push_back(i);
            }
        }

        if (mask_positions.empty()) {
            break;
        }

        float t = timesteps[step];
        float s = timesteps[step + 1];

        if (params.algorithm == DIFFUSION_ALG_ORIGIN) {
            float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f;

            for (int32_t pos : mask_positions) {
                if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
                    const float * pos_logits = get_logits_for_pos(pos);
                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                        candidates[token_id].id    = token_id;
                        candidates[token_id].logit = pos_logits[token_id];
                        candidates[token_id].p     = 0.0f;
                    }

                    llama_token_data_array cur_p = {
                        /* .data     = */ candidates.data(),
                        /* .size     = */ (size_t) n_vocab,  // Reset size to full vocab
                        /* .selected = */ -1,
                        /* .sorted   = */ false,
                    };

                    llama_sampler_apply(sampler, &cur_p);
                    output_tokens[pos] = cur_p.data[cur_p.selected].id;
                }
            }
        } else {
            std::vector<std::pair<float, int32_t>> confidences;
            std::vector<llama_token>               sampled_tokens(mask_positions.size());

            for (size_t i = 0; i < mask_positions.size(); i++) {
                int32_t       pos        = mask_positions[i];
                const float * pos_logits = get_logits_for_pos(pos);

                for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                    candidates[token_id].logit = pos_logits[token_id];
                    candidates[token_id].p     = 0.0f;
                    candidates[token_id].id    = token_id;
                }

                llama_token_data_array cur_p = {
                    /* .data     = */ candidates.data(),
                    /* .size     = */ candidates.size(),
                    /* .selected = */ -1,
                    /* .sorted   = */ false,
                };

                llama_sampler_apply(sampler, &cur_p);

                llama_token sampled_token = cur_p.data[cur_p.selected].id;

                float confidence = 0.0f;
                if (params.algorithm == DIFFUSION_ALG_ENTROPY) {
                    const float epsilon = 1e-10f;
                    for (size_t j = 0; j < cur_p.size; j++) {
                        float prob = cur_p.data[j].p;
                        confidence += prob * logf(prob + epsilon);
                    }
                } else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) {
                    confidence = cur_p.data[0].p - cur_p.data[1].p;
                } else {
                    confidence = cur_p.data[cur_p.selected].p;
                }

                sampled_tokens[i] = sampled_token;
                confidences.emplace_back(confidence, i);
            }

            int32_t num_transfer =
                (step < params.steps - 1) ? (int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size();

            if (num_transfer > 0) {
                if (params.alg_temp == 0.0f) {
                    std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(),
                                      [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                                          if (a.first != b.first) {
                                              return a.first > b.first;
                                          }
                                          return a.second < b.second;
                                      });
                } else {
                    conf_candidates.clear();

                    for (int32_t pos = 0; pos < max_length; pos++) {
                        float conf_logit = -std::numeric_limits<float>::infinity();

                        auto it = std::find(mask_positions.begin(), mask_positions.end(), pos);
                        if (it != mask_positions.end()) {
                            size_t mask_idx = std::distance(mask_positions.begin(), it);
                            conf_logit = confidences[mask_idx].first / params.alg_temp;  // Apply temperature scaling
                        }

                        conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f });
                    }

                    llama_token_data_array conf_array = {
                        /* .data     = */ conf_candidates.data(),
                        /* .size     = */ conf_candidates.size(),
                        /* .selected = */ -1,
                        /* .sorted   = */ false,
                    };

                    for (int32_t i = 0; i < num_transfer; i++) {
                        // Apply distribution sampler to get selected index
                        llama_sampler_apply(dist_sampler, &conf_array);
                        int selected_idx      = conf_array.selected;
                        confidences[i].second = conf_candidates[selected_idx].id;

                        conf_candidates[selected_idx].p = 0.0f;
                        conf_array.selected             = -1;
                    }
                }

                if (params.alg_temp == 0.0f) {
                    // Deterministic - use confidence order
                    for (int32_t i = 0; i < num_transfer; i++) {
                        int32_t     mask_idx = confidences[i].second;
                        int32_t     pos      = mask_positions[mask_idx];
                        llama_token token    = sampled_tokens[mask_idx];
                        output_tokens[pos]   = token;
                    }
                } else {
                    for (int32_t i = 0; i < num_transfer; i++) {
                        int32_t pos = confidences[i].second;
                        auto    it  = std::find(mask_positions.begin(), mask_positions.end(), pos);
                        if (it != mask_positions.end()) {
                            int32_t mask_idx   = std::distance(mask_positions.begin(), it);
                            output_tokens[pos] = sampled_tokens[mask_idx];
                        }
                    }
                }
            }
        }
        int64_t time_end_sampling = ggml_time_us();
        total_sampling_time += time_end_sampling - time_start_sampling;
    }
    int64_t time_end = ggml_time_us();
    total_time += time_end - time_start;

    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
            total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps);

    llama_batch_free(batch);
    llama_sampler_free(sampler);
    llama_sampler_free(dist_sampler);

    n_generated = max_length;
}
struct callback_data {
    diffusion_params *  diff_params;
    const llama_vocab * vocab;
    int32_t             n_input;
    const common_params_diffusion * diff_params;
    const llama_vocab *             vocab;
    int32_t                         n_input;
};

static float calculate_confidence(const llama_token_data_array & cur_p,
                                  diffusion_algorithm algorithm,
                                  std::mt19937 & rng) {
    switch (algorithm) {
        case CONFIDENCE_BASED:
            return cur_p.data[cur_p.selected].p;  // Selected token probability

        case ENTROPY_BASED:
            {
                float       entropy = 0.0f;
                const float epsilon = 1e-10f;
                for (size_t i = 0; i < cur_p.size; i++) {
                    float prob = cur_p.data[i].p;
                    entropy += prob * logf(prob + epsilon);
                }
                return -entropy;  // Higher entropy = lower confidence
            }

        case MARGIN_BASED:
            return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;

        case RANDOM:
            {
                std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
                return uniform(rng);  // Random confidence
            }

        case ORIGIN:
            return cur_p.data[cur_p.selected].p;

        default:
            return 0.0f;
    }
}

// Unified transfer count calculation function
static int32_t calculate_transfer_count(int32_t step,
                                        int32_t total_steps,
                                        int32_t remaining_masked,
                                        transfer_schedule schedule,
                                        float eps,
                                        const std::vector<int32_t> & num_transfer_tokens = {}) {
    switch (schedule) {
        case TIMESTEP_BASED:
            {
                float t          = 1.0f - (float) step / total_steps * (1.0f - eps);
                float s          = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
                float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
                return (int32_t) (remaining_masked * p_transfer);
            }

        case BLOCK_BASED:
            if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
                return num_transfer_tokens[step];
            }
            return remaining_masked / (total_steps - step);  // Fallback

        default:
            return remaining_masked / (total_steps - step);
    }
}

static bool diffusion_step_callback(int32_t step,
                                    int32_t total_steps,
static bool diffusion_step_callback(int32_t step,
                                    int32_t total_steps,
                                    const llama_token * tokens,
                                    int32_t n_tokens,
                                    void * user_data) {
    (void) user_data;
                                    int32_t n_tokens,
                                    void * user_data) {
    (void)user_data;

    callback_data * data = static_cast<callback_data *>(user_data);

@@ -134,11 +350,11 @@ static bool diffusion_step_callback(int32_t step,
        int progress_percent = (step * 100) / total_steps;
        int progress_bars    = (step * 50) / total_steps;
        LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%",
                step,
                total_steps,
                std::string(progress_bars, '=').c_str(),
                std::string(50 - progress_bars, ' ').c_str(),
                progress_percent);
                step,
                total_steps,
                std::string(progress_bars, '=').c_str(),
                std::string(50 - progress_bars, ' ').c_str(),
                progress_percent);
    };

    if (data->diff_params->visual_mode) {
@@ -175,360 +391,6 @@ static bool diffusion_step_callback(int32_t step,
    return true;
}

static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
    if (temperature == 0.0f) {
        return;
    }

    std::uniform_real_distribution<double> uniform(0.0, 1.0);
    for (int32_t i = 0; i < n_vocab; i++) {
        double noise = uniform(rng);
        // Prevent log(0)
        noise               = std::max(noise, 1e-20);
        double gumbel_noise = std::pow(-std::log(noise), temperature);
        logits[i]           = std::exp(logits[i]) / gumbel_noise;
    }
}
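// note (editorial): add_gumbel_noise above rescales each logit to exp(logit) / (-log u)^T
// with u ~ U(0,1) and T = temperature, i.e. Gumbel-style noise whose strength grows with T;
// T == 0 is handled by the early return.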
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
    std::vector<int32_t> num_transfer_tokens(steps);

    int32_t base      = mask_count / steps;
    int32_t remainder = mask_count % steps;

    for (int32_t i = 0; i < steps; i++) {
        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
    }

    return num_transfer_tokens;
}

static void diffusion_generate(llama_context * ctx,
                               const llama_token * input_tokens,
                               llama_token * output_tokens,
                               int32_t n_input,
                               const diffusion_params & params,
                               int32_t & n_generated) {
    n_generated = 0;
    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
        return;
    }

    const llama_model * model = llama_get_model(ctx);

    // Initialize with input and pad with mask tokens
    std::copy(input_tokens, input_tokens + n_input, output_tokens);
    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);

    std::mt19937 rng(params.seed);

    llama_set_causal_attn(ctx, false);

    int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));

    std::vector<llama_token_data> candidates(n_vocab);
    std::vector<llama_token_data> conf_candidates;
    conf_candidates.reserve(params.max_length);
    std::vector<int32_t> mask_positions;
    mask_positions.reserve(params.max_length);

    // Setup sampler chain
    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (params.top_k > 0) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
    }
    if (params.top_p < 1.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
    }
    if (params.temperature > 0.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
    }
    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));

    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);

    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
    batch.n_tokens    = params.max_length;

    // Pre-allocate buffers for CFG if needed
    int32_t                  logits_size = n_vocab * params.max_length;
    std::vector<float>       cond_logits_buffer;
    std::vector<llama_token> un_x_buffer;
    if (params.cfg_scale > 0.0f) {
        cond_logits_buffer.resize(logits_size);
        un_x_buffer.resize(params.max_length);
    }

    // For block-based processing
    std::vector<int32_t> num_transfer_tokens;
    int32_t              num_blocks      = 1;
    int32_t              steps_per_block = params.steps;

    if (params.schedule == BLOCK_BASED) {
        GGML_ASSERT(params.max_length % params.block_length == 0);
        num_blocks = params.max_length / params.block_length;
        GGML_ASSERT(params.steps % num_blocks == 0);
        steps_per_block = params.steps / num_blocks;
    }

    std::vector<float> confidence(params.max_length);

    int64_t total_sampling_time = 0;
    int64_t total_time          = 0;
    int64_t time_start          = ggml_time_us();

    for (int block_num = 0; block_num < num_blocks; block_num++) {
        int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0;
        int32_t block_end   = (params.schedule == BLOCK_BASED) ?
                                  std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
                                  params.max_length;

        // Count masked tokens in current block for block-based processing
        if (params.schedule == BLOCK_BASED) {
            int32_t block_mask_count = 0;
            for (int i = block_start; i < block_end; i++) {
                if (output_tokens[i] == params.mask_token_id) {
                    block_mask_count++;
                }
            }
            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
        }

        for (int32_t step = 0; step < steps_per_block; step++) {
            int32_t global_step = block_num * steps_per_block + step;

            if (params.step_callback) {
                if (!params.step_callback(
                        global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
                    break;
                }
            }

            // Setup batch
            for (int32_t i = 0; i < params.max_length; i++) {
                batch.token[i]     = output_tokens[i];
                batch.pos[i]       = i;
                batch.n_seq_id[i]  = 1;
                batch.seq_id[i][0] = 0;
                batch.logits[i]    = 1;
            }

            float * logits = nullptr;

            if (params.cfg_scale > 0.0f) {
                int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate conditional");
                    break;
                }
                float * cond_logits_ptr = llama_get_logits(ctx);
                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));

                // Unconditional generation (mask input)
                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
                for (int32_t i = 0; i < n_input; i++) {
                    un_x_buffer[i] = params.mask_token_id;
                }

                for (int32_t i = 0; i < params.max_length; i++) {
                    batch.token[i] = un_x_buffer[i];
                }
                ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate unconditional");
                    break;
                }
                float * uncond_logits = llama_get_logits(ctx);

                // Apply CFG
                for (int32_t i = 0; i < logits_size; i++) {
                    cond_logits_buffer[i] =
                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
                }
                logits = cond_logits_buffer.data();
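                // note (editorial): the loop above is standard classifier-free guidance:
                // with guidance weight w = cfg_scale + 1, logits = uncond + w * (cond - uncond).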
            } else {
                int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
                    break;
                }
                logits = llama_get_logits(ctx);
            }

            if (!logits) {
                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                break;
            }

            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
                if (params.shift_logits) {
                    return pos == 0 ? logits : logits + (pos - 1) * n_vocab;
                }
                return logits + (pos) * n_vocab;
            };
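            // note (editorial): when the model's diffusion.shift_logits metadata is set,
            // position pos reads the logits row produced at pos - 1 (a next-token
            // parameterization), with position 0 falling back to its own row.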
            int64_t time_start_sampling = ggml_time_us();

            mask_positions.clear();
            for (int32_t i = 0; i < params.max_length; i++) {
                if (output_tokens[i] == params.mask_token_id) {
                    // For block-based, only consider current block
                    if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) {
                        mask_positions.push_back(i);
                    }
                }
            }

            if (mask_positions.empty()) {
                break;
            }

            if (params.add_gumbel_noise && params.temperature > 0.0f) {
                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
            }

            if (params.algorithm == ORIGIN) {
                int32_t transfer_count = calculate_transfer_count(
                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);
                float p_transfer = (float) transfer_count / mask_positions.size();

                for (int32_t pos : mask_positions) {
                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
                        const float * pos_logits = get_logits_for_pos(pos);
                        for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                            candidates[token_id].id    = token_id;
                            candidates[token_id].logit = pos_logits[token_id];
                            candidates[token_id].p     = 0.0f;
                        }

                        llama_token_data_array cur_p = {
                            candidates.data(),
                            (size_t) n_vocab,
                            -1,
                            false,
                        };

                        llama_sampler_apply(sampler, &cur_p);
                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
                    }
                }
            } else {
                std::vector<std::pair<float, int32_t>> confidences;
                std::vector<llama_token>               sampled_tokens(mask_positions.size());

                for (size_t i = 0; i < mask_positions.size(); i++) {
                    int32_t       pos        = mask_positions[i];
                    const float * pos_logits = get_logits_for_pos(pos);

                    for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                        candidates[token_id].logit = pos_logits[token_id];
                        candidates[token_id].p     = 0.0f;
                        candidates[token_id].id    = token_id;
                    }

                    llama_token_data_array cur_p = {
                        candidates.data(),
                        candidates.size(),
                        -1,
                        false,
                    };

                    llama_sampler_apply(sampler, &cur_p);
                    llama_token sampled_token = cur_p.data[cur_p.selected].id;

                    float conf = calculate_confidence(cur_p, params.algorithm, rng);

                    sampled_tokens[i] = sampled_token;
                    confidences.emplace_back(conf, i);
                }

                int32_t transfer_count = calculate_transfer_count(
                    step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens);

                if (transfer_count > 0) {
                    if (params.alg_temp == 0.0f) {
                        std::partial_sort(confidences.begin(),
                                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
                                          confidences.end(),
                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                                              if (a.first != b.first) {
                                                  return a.first > b.first;
                                              }
                                              return a.second < b.second;
                                          });

                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
                            int32_t mask_idx   = confidences[i].second;
                            int32_t pos        = mask_positions[mask_idx];
                            output_tokens[pos] = sampled_tokens[mask_idx];
                        }
                    } else {
                        conf_candidates.clear();
                        for (size_t i = 0; i < confidences.size(); i++) {
                            float conf_logit = confidences[i].first / params.alg_temp;
                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
                        }

                        llama_token_data_array conf_array = {
                            conf_candidates.data(),
                            conf_candidates.size(),
                            -1,
                            false,
                        };

                        for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) {
                            llama_sampler_apply(dist_sampler, &conf_array);
                            int32_t selected_idx = conf_array.selected;
                            int32_t mask_idx     = selected_idx;
                            int32_t pos          = mask_positions[mask_idx];
                            output_tokens[pos]   = sampled_tokens[mask_idx];

                            conf_candidates[selected_idx].p = 0.0f;
                            conf_array.selected             = -1;
                        }
                    }
                }
            }

            int64_t time_end_sampling = ggml_time_us();
            total_sampling_time += time_end_sampling - time_start_sampling;
        }
    }

    int64_t time_end = ggml_time_us();
    total_time += time_end - time_start;

    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
            total_time / 1000.0,
            total_time / 1000.0 / params.steps,
            total_sampling_time / 1000.0 / params.steps);

    llama_batch_free(batch);
    llama_sampler_free(sampler);
    llama_sampler_free(dist_sampler);

    n_generated = params.max_length;
}

static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
    if (!use_chat_template) {
        return prompt;
    }

    auto chat_templates = common_chat_templates_init(model, "");

    common_chat_templates_inputs inputs;
    common_chat_msg              user_msg;
    user_msg.role                = "user";
    user_msg.content             = prompt;
    inputs.add_generation_prompt = true;
    inputs.messages.push_back(user_msg);

    auto result = common_chat_templates_apply(chat_templates.get(), inputs);

    return result.prompt;
}

int main(int argc, char ** argv) {
    ggml_time_init();

@@ -538,6 +400,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

    const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" };
    const char * alg_name    = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ?
                                   alg_names[params.diffusion.algorithm] :
                                   "UNKNOWN";

    common_init();
    llama_backend_init();

@@ -554,12 +421,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

    if (!llama_model_is_diffusion(model)) {
        LOG_ERR("error: unsupported model for diffusion");
        llama_model_free(model);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx                = params.n_ctx;
    ctx_params.n_batch              = params.n_batch;

@@ -581,12 +442,10 @@ int main(int argc, char ** argv) {
    const llama_vocab * vocab            = llama_model_get_vocab(model);
    std::string         formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);

    std::vector<llama_token> input_tokens = common_tokenize(vocab,
                                                            formatted_prompt,
    std::vector<llama_token> input_tokens = common_tokenize(vocab, formatted_prompt,
                                                            /*add special tokens*/ true,
                                                            /*parse special*/ true);

    int n_input = input_tokens.size();
    int n_input = input_tokens.size();

    if (n_input >= params.n_ctx) {
        LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx);
@@ -595,79 +454,44 @@ int main(int argc, char ** argv) {
        return 1;
    }

    struct diffusion_params ldiff_params = diffusion_default_params();
    ldiff_params.steps                   = params.diffusion.steps;
    ldiff_params.eps                     = params.diffusion.eps;
    ldiff_params.temperature             = params.sampling.temp;
    ldiff_params.top_p                   = params.sampling.top_p;
    ldiff_params.top_k                   = params.sampling.top_k;
    ldiff_params.algorithm               = static_cast<enum diffusion_alg>(params.diffusion.algorithm);
    ldiff_params.alg_temp                = params.diffusion.alg_temp;
    ldiff_params.seed                    = params.sampling.seed;

    llama_token mask_token_id = llama_vocab_mask(vocab);
    GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);

    bool visual_mode = params.diffusion.visual_mode;

    int32_t n_generated = 0;
    std::vector<llama_token> output_tokens(params.n_ubatch);

    struct diffusion_params diff_params;

    char shift_logits_str[8];
    if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) {
        diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0);
    } else {
        diff_params.shift_logits = true;
    }

    // Use either eps or block length, but not both
    GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));

    if (params.diffusion.eps) {
        diff_params.schedule = TIMESTEP_BASED;
        diff_params.eps      = params.diffusion.eps;
    } else if (params.diffusion.block_length) {
        diff_params.schedule     = BLOCK_BASED;
        diff_params.block_length = params.diffusion.block_length;
    }

    diff_params.mask_token_id    = mask_token_id;
    diff_params.seed             = params.sampling.seed;
    diff_params.temperature      = params.sampling.temp;
    diff_params.steps            = params.diffusion.steps;
    diff_params.algorithm        = static_cast<diffusion_algorithm>(params.diffusion.algorithm);
    diff_params.max_length       = params.n_ubatch;
    diff_params.top_p            = params.sampling.top_p;
    diff_params.top_k            = params.sampling.top_k;
    diff_params.visual_mode      = params.diffusion.visual_mode;
    diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise;

    diff_params.step_callback = diffusion_step_callback;
    callback_data cb_data     = { &diff_params, vocab, n_input };
    diff_params.step_callback_user_data = &cb_data;

    const char * alg_names[]   = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
    const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
    const char * alg_name =
        (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
    const char * sched_name =
        (diff_params.schedule >= 0 && diff_params.schedule <= 1) ? sched_names[diff_params.schedule] : "UNKNOWN";

    LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps);
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length);
    LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
    LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
    LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
    if (diff_params.schedule == TIMESTEP_BASED) {
        LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
        LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
    }
    if (diff_params.schedule == BLOCK_BASED) {
        LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
        LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
    }
    LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps);
    LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps);
    LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm,
            alg_name);
    LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp);

    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated);
    ldiff_params.mask_token_id = mask_token_id;

    callback_data cb_data = { &params.diffusion, vocab, n_input };

    ldiff_params.step_callback           = diffusion_step_callback;
    ldiff_params.step_callback_user_data = &cb_data;

    int32_t n_generated = 0;

    std::vector<llama_token> output_tokens(params.n_ubatch);
    diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch,
                       ldiff_params, n_generated);

    if (n_generated > 0) {
        if (visual_mode) {
        if (params.diffusion.visual_mode) {
            // clear screen and move cursor to top-left
            LOG_INF("\033[2J\033[H");
        }

        output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input);
        std::string output_data = common_detokenize(vocab, output_tokens, false);
        LOG_INF("\n%s\n", output_data.c_str());
@@ -81,14 +81,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
// if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
|
||||
// --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
|
||||
// in order to support any number of prompts
|
||||
if (params.n_parallel == 1) {
|
||||
LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
|
||||
params.kv_unified = true;
|
||||
}
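    // illustrative invocation (model and file names are placeholders): when
    // the prompt count is known, pass it via the standard --parallel flag
    // instead of relying on the unified-cache fallback, e.g.
    //     llama-embedding -m model.gguf -f prompts.txt --parallel 4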

    // utilize the full context
    if (params.n_batch < params.n_ctx) {
        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);

@@ -7,7 +7,6 @@
#include <cstdio>
#include <string>
#include <vector>
#include <numeric>

/**
 * This is the arbitrary data which will be passed to each callback.

@@ -78,12 +77,6 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
            LOG(" ]\n");
            LOG(" sum = %f\n", sum);
        }

    // TODO: make this abort configurable/optional?
    if (std::isnan(sum)) {
        LOG_ERR("encountered NaN - aborting\n");
        exit(0);
    }
}

/**
@@ -15,12 +15,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

    if (params.n_parallel == 1) {
        // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
        printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
        params.kv_unified = true;
    }

    common_init();

    if (params.n_predict < 0) {

@@ -59,15 +59,13 @@ int main(int argc, char ** argv) {
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

    common_init_result llama_init_dft = common_init_from_params(params);

    //model_dft = llama_init_dft.model.get();
    ctx_dft = llama_init_dft.context.get();

    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
        return 1;
    }

    // Tokenize the prompt

@@ -132,10 +130,7 @@ int main(int argc, char ** argv) {
    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
    params_spec.p_min = p_min;

    struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
    for (auto &pair : params.speculative.replacements) {
        common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
    }
    struct common_speculative * spec = common_speculative_init(ctx_dft);

    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);

@@ -85,8 +85,6 @@ int main(int argc, char ** argv) {
    }

    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
    params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;

    common_init_result llama_init_dft = common_init_from_params(params);

    model_dft = llama_init_dft.model.get();
@@ -10,20 +10,20 @@
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

int main(int argc, char ** argv) {
    common_params params;

    params.escape = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
        return 1;
    }

    if (params.use_mmap) {
        LOG_INF("%s: force disabling memory mapping because it would result in read-only pointers to the weights\n",
                __func__);
        LOG_INF("%s: force disabling memory mapping because it would result in read-only pointers to the weights\n", __func__);
        params.use_mmap = false;
    }
    if (params.cache_type_k != GGML_TYPE_F32) {

@@ -38,10 +38,11 @@ int main(int argc, char ** argv) {
    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model and apply lora adapter, if any
    common_init_result llama_init = common_init_from_params(params);
    llama_model_ptr & model = llama_init.model;
    llama_context_ptr & ctx = llama_init.context;

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);

@@ -54,32 +55,31 @@ int main(int argc, char ** argv) {
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
    constexpr float val_split = 0.05f;

    struct lr_opt & lr = params.lr;
    LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
            ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
            (unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);

    struct llama_opt_params lopt_params{
        /*n_ctx_train =*/0,
        /*param_filter =*/llama_opt_param_filter_all,
        /*param_filter_ud =*/nullptr,
        /*get_opt_pars =*/common_opt_lr_pars,
        /*get_opt_pars_ud =*/&params.lr,
        /*optimizer_type =*/params.optimizer,
    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
    optimizer_params.adamw.alpha = 1e-7f; // learning rate

    struct llama_opt_params lopt_params {
        /*n_ctx_train =*/ 0,
        /*param_filter =*/ llama_opt_param_filter_all,
        /*param_filter_ud =*/ nullptr,
        /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
        /*get_opt_pars_ud =*/ &optimizer_params,
    };
    llama_opt_init(ctx.get(), model.get(), lopt_params);

    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval = ggml_opt_result_init();

    for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
    for (int epoch = 0; epoch < 2; ++epoch) {
        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
                        ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
        fprintf(stderr, "\n");

        ggml_opt_result_reset(result_train);

@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);

    llama_model_save_to_file(model.get(), params.out_file.c_str());
    llama_model_save_to_file(model.get(), "finetuned-model.gguf");

    llama_backend_free();
@@ -36,6 +36,9 @@
#     ```
#     nixConfig = {
#       extra-substituters = [
#         # Populated by the CI in ggml-org/llama.cpp
#         "https://llama-cpp.cachix.org"
#
#         # A development cache for nixpkgs imported with `config.cudaSupport = true`.
#         # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
#         # This lets one skip building e.g. the CUDA-enabled openmpi.

@@ -44,8 +47,10 @@
#       ];
#
#       # Verify these are the same keys as published on
#       # - https://app.cachix.org/cache/llama-cpp
#       # - https://app.cachix.org/cache/cuda-maintainers
#       extra-trusted-public-keys = [
#         "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
#         "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
#       ];
#     };

@@ -39,9 +39,8 @@ if (WIN32)
    set(CMAKE_SHARED_MODULE_PREFIX "")
endif()

option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL)")
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

#
# option list

@@ -132,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_NNPA "ggml: enable nnpa" ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")

@@ -175,8 +174,6 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)

@@ -188,7 +185,6 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -34,8 +34,8 @@ if (NOT GGML_SHARED_LIB)

    if (GGML_BLAS)
        find_dependency(BLAS)
        list(APPEND GGML_BLAS_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_BLAS_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
    endif()

    if (GGML_CUDA)

@@ -125,56 +125,54 @@ if(NOT TARGET ggml::ggml)
                          IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")

    set(_ggml_all_targets "")
    if (NOT GGML_BACKEND_DL)
        foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
            string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
            string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)

            find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
                REQUIRED
                HINTS ${GGML_LIB_DIR}
                NO_CMAKE_FIND_ROOT_PATH)

            message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")

            add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
            set_target_properties(ggml::${_ggml_backend}
                PROPERTIES
                INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
                IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
                IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
                INTERFACE_COMPILE_FEATURES c_std_90
                POSITION_INDEPENDENT_CODE ON)

            string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
            if(is_cpu_variant)
                list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                set_target_properties(ggml::${_ggml_backend}
                    PROPERTIES
                    INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")

                if(GGML_CPU_INTERFACE_LINK_OPTIONS)
                    set_target_properties(ggml::${_ggml_backend}
                        PROPERTIES
                        INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
                endif()

            else()
                list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                set_target_properties(ggml::${_ggml_backend}
                    PROPERTIES
                    INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")

                if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
                    set_target_properties(ggml::${_ggml_backend}
                        PROPERTIES
                        INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
                endif()
            endif()

            list(APPEND _ggml_all_targets ggml::${_ggml_backend})
        endforeach()
    endif()

    list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
    set_target_properties(ggml::ggml
@@ -74,26 +74,16 @@ extern "C" {
        GGML_OPT_BUILD_TYPE_OPT = 30,
    };

    enum ggml_opt_optimizer_type {
        GGML_OPT_OPTIMIZER_TYPE_ADAMW,
        GGML_OPT_OPTIMIZER_TYPE_SGD,

        GGML_OPT_OPTIMIZER_TYPE_COUNT
    };

    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
    struct ggml_opt_optimizer_params {
        // AdamW optimizer parameters
        struct {
            float alpha; // learning rate
            float beta1; // first AdamW momentum
            float beta2; // second AdamW momentum
            float beta1;
            float beta2;
            float eps;   // epsilon for numerical stability
            float wd;    // weight decay - 0.0f to disable
            float wd;    // weight decay for AdamW, use 0.0f to disable
        } adamw;
        struct {
            float alpha; // learning rate
            float wd;    // weight decay
        } sgd;
    };
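    // a minimal constant-parameter callback built on this struct could look
    // like the following (sketch; the name and values are illustrative):
    //     static struct ggml_opt_optimizer_params my_sgd_pars(void * userdata) {
    //         struct ggml_opt_optimizer_params p = ggml_opt_get_default_optimizer_params(NULL);
    //         p.sgd.alpha = 1e-3f; // learning rate
    //         p.sgd.wd    = 0.0f;  // disable weight decay
    //         return p;
    //     }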

    // callback to calculate optimizer parameters prior to a backward pass

@@ -122,11 +112,8 @@ extern "C" {

        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done

        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters

        // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
        enum ggml_opt_optimizer_type optimizer;
        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
    };

    // get parameters for an optimization context with defaults set where possible

@@ -155,10 +142,6 @@ extern "C" {
    // get the gradient accumulator for a node from the forward graph
    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

    GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); // TODO: consistent naming scheme

    GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);

    // ====== Optimization Result ======

    GGML_API ggml_opt_result_t ggml_opt_result_init(void);

@@ -243,14 +226,12 @@ extern "C" {
            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
            enum ggml_opt_loss_type         loss_type,      // loss to minimize
            enum ggml_opt_optimizer_type    optimizer,      // sgd or adamw
            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
            int64_t                         nepoch,         // how many times the dataset should be iterated over
            int64_t                         nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
            bool                            silent);        // whether or not info prints to stderr should be suppressed

#ifdef __cplusplus
}
#endif

@@ -1,16 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

#ifdef __cplusplus
}
#endif
@@ -241,8 +241,6 @@
#define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24

#define GGML_MROPE_SECTIONS 4

#define GGML_UNUSED(x) (void)(x)

#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

@@ -306,16 +304,6 @@
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

#define GGML_TENSOR_TERNARY_OP_LOCALS \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

#define GGML_TENSOR_BINARY_OP_LOCALS01 \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \

@@ -407,8 +395,7 @@ extern "C" {
        // GGML_TYPE_IQ4_NL_4_4 = 36,
        // GGML_TYPE_IQ4_NL_4_8 = 37,
        // GGML_TYPE_IQ4_NL_8_8 = 38,
        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
        GGML_TYPE_COUNT = 40,
        GGML_TYPE_COUNT = 39,
    };

    // precision

@@ -443,7 +430,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
        GGML_FTYPE_MOSTLY_MXFP4  = 25, // except 1d tensors
    };

    // available tensor operations:

@@ -452,7 +438,6 @@ extern "C" {

        GGML_OP_DUP,
        GGML_OP_ADD,
        GGML_OP_ADD_ID,
        GGML_OP_ADD1,
        GGML_OP_ACC,
        GGML_OP_SUB,

@@ -542,7 +527,6 @@ extern "C" {
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,
        GGML_OP_OPT_STEP_SGD,

        GGML_OP_GLU,

@@ -573,7 +557,6 @@ extern "C" {
        GGML_GLU_OP_REGLU,
        GGML_GLU_OP_GEGLU,
        GGML_GLU_OP_SWIGLU,
        GGML_GLU_OP_SWIGLU_OAI,
        GGML_GLU_OP_GEGLU_ERF,
        GGML_GLU_OP_GEGLU_QUICK,

@@ -848,13 +831,6 @@ extern "C" {
            struct ggml_tensor  * b,
            enum   ggml_type      type);

    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
    GGML_API struct ggml_tensor * ggml_add_id(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            struct ggml_tensor  * ids);
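    // reference semantics of ggml_add_id as a plain loop (sketch, not part
    // of the API surface): for each (i2, i1, i0) of dst,
    //     dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
    // i.e. ids selects, per row of a, which row of b gets added to it.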

    GGML_API struct ggml_tensor * ggml_add1(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -1222,13 +1198,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

    GGML_API struct ggml_tensor * ggml_swiglu_oai(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            float                 alpha,
            float                 limit);

    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,

@@ -1601,10 +1570,6 @@ extern "C" {
            float                 scale,
            float                 max_bias);

    GGML_API void ggml_soft_max_add_sinks(
            struct ggml_tensor * a,
            struct ggml_tensor * sinks);

    GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -1663,7 +1628,7 @@ extern "C" {
            struct ggml_tensor  * b,
            struct ggml_tensor  * c,
            int                   n_dims,
            int                   sections[GGML_MROPE_SECTIONS],
            int                   sections[4],
            int                   mode,
            int                   n_ctx_orig,
            float                 freq_base,

@@ -1689,22 +1654,6 @@ extern "C" {
            float                 beta_fast,
            float                 beta_slow);

    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            struct ggml_tensor  * c,
            int                   n_dims,
            int                   sections[GGML_MROPE_SECTIONS],
            int                   mode,
            int                   n_ctx_orig,
            float                 freq_base,
            float                 freq_scale,
            float                 ext_factor,
            float                 attn_factor,
            float                 beta_fast,
            float                 beta_slow);

    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -2103,10 +2052,6 @@ extern "C" {
    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
            const struct ggml_tensor * a);

    GGML_API void ggml_flash_attn_ext_add_sinks(
            struct ggml_tensor * a,
            struct ggml_tensor * sinks);

    // TODO: needs to be adapted to ggml_flash_attn_ext
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
            struct ggml_context * ctx,

@@ -2312,14 +2257,7 @@ extern "C" {
            struct ggml_tensor  * grad,
            struct ggml_tensor  * m,
            struct ggml_tensor  * v,
            struct ggml_tensor  * adamw_params); // parameters such as the learning rate

    // stochastic gradient descent step (with weight decay)
    GGML_API struct ggml_tensor * ggml_opt_step_sgd(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * grad,
            struct ggml_tensor  * sgd_params); // alpha, weight decay
            struct ggml_tensor  * adamw_params); // parameters such a the learning rate
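    // the SGD step presumably applies the usual decoupled weight-decay
    // update, with alpha and wd taken from sgd_params (sketch):
    //     w = w * (1 - alpha * wd) - alpha * g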

    //
    // automatic differentiation

@@ -214,13 +214,6 @@ add_library(ggml
            ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)

if (GGML_BACKEND_DIR)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
    endif()
    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
endif()

target_link_libraries(ggml PUBLIC ggml-base)

if (CMAKE_SYSTEM_NAME MATCHES "Linux")

@@ -234,11 +227,7 @@ function(ggml_add_backend_library backend)
        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
        target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
        add_dependencies(ggml ${backend})
        if (GGML_BACKEND_DIR)
            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
        else()
            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
        endif()
        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
    else()
        add_library(${backend} ${ARGN})
        target_link_libraries(ggml PUBLIC ${backend})

@@ -382,7 +371,6 @@ ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)

foreach (target ggml-base ggml)
@@ -29,7 +29,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
        case GGML_OP_DIAG_MASK_ZERO:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:

@@ -49,10 +49,6 @@
#include "ggml-webgpu.h"
#endif

#ifdef GGML_USE_ZDNN
#include "ggml-zdnn.h"
#endif

#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif

@@ -184,9 +180,6 @@ struct ggml_backend_registry {
#ifdef GGML_USE_WEBGPU
        register_backend(ggml_backend_webgpu_reg());
#endif
#ifdef GGML_USE_ZDNN
        register_backend(ggml_backend_zdnn_reg());
#endif
#ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
#endif

@@ -505,9 +498,6 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,

    std::vector<fs::path> search_paths;
    if (user_search_path == nullptr) {
#ifdef GGML_BACKEND_DIR
        search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
#endif
        // default search paths: executable directory, current directory
        search_paths.push_back(get_executable_path());
        search_paths.push_back(fs::current_path());
@@ -1071,11 +1071,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                }
            }
        }
        // if the node is still unassigned, assign it to the first backend that supports it
        for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
            ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
        }
        GGML_ASSERT(*cur_backend_id != -1);
    }

    // pass 5: split graph, find tensors that need to be copied

@@ -1103,7 +1098,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

        const int node_backend_id = tensor_backend_id(node);

        GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
        assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

        // check if we should start a new split based on the sources of the current node
        bool need_new_split = false;

@@ -1161,7 +1156,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

            size_t src_id = hash_id(src);
            const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
            GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
            assert(src_backend_id != -1); // all inputs should be assigned by now

            if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {

@@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) {
    ggml_backend_blas_context * ctx = new ggml_backend_blas_context;

    ggml_backend_t backend = new ggml_backend {
        /* .guid      = */ ggml_backend_blas_guid(),
        /* .iface     = */ blas_backend_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
        /* .context   = */ ctx,
        /* .guid      = */ ggml_backend_blas_guid(),
        /* .interface = */ blas_backend_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
        /* .context   = */ ctx,
    };

#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
@@ -31,13 +31,6 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)

if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
    message(FATAL_ERROR
        "CANN Graph (ACL graph mode) is not supported on 310P devices. "
        "Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
endif()

if (CANN_INSTALL_DIR)
    # Only Support Linux.

@@ -75,13 +68,6 @@ if (CANN_INSTALL_DIR)

    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

    if (USE_ACL_GRAPH)
        target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
        message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
    else()
        message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
    endif()

    message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
    message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else()

@@ -77,8 +77,6 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    for (int i = 0; i < final_dims; i++) {
        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
    }
    size_t elem_offset = offset / ggml_element_size(tensor);
    acl_storage_len += elem_offset;

    // Reverse ne and stride.
    std::reverse(acl_ne, acl_ne + final_dims);

@@ -86,7 +84,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,

    aclTensor* acl_tensor = aclCreateTensor(
        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
        elem_offset, format, &acl_storage_len, 1,
        offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
        tensor->data);

    return acl_tensor;
@@ -68,8 +68,6 @@
#include <aclnnop/aclnn_grouped_matmul_v3.h>
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
#include <aclnnop/aclnn_zero.h>
#include <aclnnop/aclnn_index_copy.h>
#include <aclnnop/aclnn_index_select.h>
#include <float.h>

#include <cmath>

@@ -101,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
    }
}

void ggml_cann_op_unary(
void ggml_cann_unary_op(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

@@ -113,42 +111,6 @@ void ggml_cann_op_unary(
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}

void ggml_cann_op_unary_gated(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(ggml_is_contiguous_1(dst));
    const int32_t swapped = ggml_get_op_params_i32(dst, 1);

    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src0->type == src1->type);

        acl_src0 = ggml_cann_create_tensor(src0);
        acl_src1 = ggml_cann_create_tensor(src1);
    } else {
        int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
        size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
        acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
        acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
        if (swapped) {
            std::swap(acl_src0, acl_src1);
        }
    }

    unary_op(ctx, acl_src0, acl_dst);
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);

    ggml_cann_release_resources(ctx, acl_src0, acl_dst);
    if (src1)
        ggml_cann_release_resources(ctx, acl_src1);
}

/**
 * @brief Repeats elements of a tensor along each dimension according to the
 *        specified repeat array.
@@ -753,55 +715,69 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    if (ggml_are_same_shape(src0, dst)) {
        aclTensor* acl_src = ggml_cann_create_tensor(src0);
        aclTensor* acl_dst = ggml_cann_create_tensor(dst);
        if (dst->type == src0->type) {
            cann_copy(ctx, acl_src, acl_dst);
        } else {
            aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
        }
        ggml_cann_release_resources(ctx, acl_src, acl_dst);
    } else {
        void* src_trans_buffer = src0->data;
        ggml_cann_pool_alloc src_buffer_allocator;
        if (!ggml_is_contiguous(src0)) {
            aclTensor* acl_src = ggml_cann_create_tensor(src0);
            src_buffer_allocator.alloc(ctx.pool(),
                                       ggml_nelements(src0) * ggml_type_size(src0->type));
            src_trans_buffer = src_buffer_allocator.get();
        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
            if (dst->type == src0->type) {
                size_t cpy_size = ggml_nbytes(dst);
                ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
                                       ACL_MEMCPY_DEVICE_TO_DEVICE);
                return;
            } else {
                ggml_cann_pool_alloc src_buffer_allocator(
                    ctx.pool(),
                    ggml_nelements(dst) * ggml_type_size(dst->type));
                void* src_trans_buffer = src_buffer_allocator.get();
                size_t src_trans_nb[GGML_MAX_DIMS];
                src_trans_nb[0] = ggml_type_size(dst->type);
                for (int i = 1; i < GGML_MAX_DIMS; i++) {
                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
                }
                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
                    GGML_MAX_DIMS);

                aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));
                size_t cpy_size = ggml_nbytes(dst);
                ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
                                       ACL_MEMCPY_DEVICE_TO_DEVICE);
                ggml_cann_release_resources(ctx, src_trans_tensor);
                return;
            }
        } else if (ggml_is_contiguous(dst)) {
            ggml_cann_pool_alloc src_buffer_allocator(
                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
            void* src_trans_buffer = src_buffer_allocator.get();
            size_t src_trans_nb[GGML_MAX_DIMS];
            src_trans_nb[0] = ggml_type_size(src0->type);
            src_trans_nb[0] = ggml_type_size(dst->type);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
                src_trans_buffer, ggml_cann_type_mapping(src0->type),
                ggml_type_size(src0->type), src0->ne, src_trans_nb,
                src_trans_buffer, ggml_cann_type_mapping(dst->type),
                ggml_type_size(dst->type), src0->ne, src_trans_nb,
                GGML_MAX_DIMS);
            cann_copy(ctx, acl_src, src_trans_tensor);
            ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
        }

        size_t src_reshape_nb[GGML_MAX_DIMS];
        src_reshape_nb[0] = ggml_type_size(src0->type);
        for (int i = 1; i < GGML_MAX_DIMS; i++) {
            src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
        }
        aclnn_cast(ctx, acl_src, src_trans_tensor, ggml_cann_type_mapping(dst->type));

        aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
            ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
            dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
        aclTensor* acl_dst = ggml_cann_create_tensor(dst);

        if (dst->type == src0->type) {
            cann_copy(ctx, trans_acl_src, acl_dst);
            size_t cpy_size = ggml_nbytes(dst);
            ggml_cann_async_memcpy(ctx, dst->data, src_trans_buffer, cpy_size,
                                   ACL_MEMCPY_DEVICE_TO_DEVICE);
            ggml_cann_release_resources(ctx, src_trans_tensor);
            return;
        } else {
            aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
            GGML_ABORT("Unsupported: dst is not contiguous.");
        }
        ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
    }
    return;
    ggml_cann_release_resources(ctx, acl_src, acl_dst);
}

/**
@@ -1316,196 +1292,160 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
}

/**
 * @brief Generate a range of values and apply a scalar base exponentiation.
 * @brief Applies the Alibi (Attention with Linear Biases) mechanism to the
 * @details This function implements the Alibi mechanism, which introduces
 *          learnable biases into the attention scores to simulate relative
 *          position encoding without the need for explicit positional
 *          embeddings.
 *
 * This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
 * with step size `step`, stores it in a temporary buffer, and then computes:
 * @param ctx The backend CANN context for executing operations.
 * @param acl_src The source tensor representing the query or key.
 * @param acl_position The position tensor containing relative positions.
 * @param acl_dst The destination tensor where the result will be stored.
 * @param n_head The number of attention heads.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb0 The byte size of the first dimension of the source
 *        tensor.
 * @param max_bias The maximum bias value used in the Alibi mechanism.
 * @param dst The destination tensor object for additional metadata.
 *
 * @f[
 * slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
 * @f]
 *
 * The results are written to the provided @p slope_buffer.
 *
 * @param ctx CANN backend context for memory allocation and operator execution.
 * @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
 * @param m Scalar base for the exponentiation.
 * @param size Number of elements in the generated sequence.
 * @param start Starting exponent offset.
 * @param stop Stopping exponent offset (exclusive).
 * @param step Step size for the exponent increment.
 * The function performs the following steps:
 * 1. Calculates the logarithm floor of the number of heads to determine the
 *    base for bias calculation.
 * 2. Initializes arrays with arithmetic sequences and fills them with bias
 *    values.
 * 3. Computes the bias tensor based on the calculated biases and arithmetic
 *    sequences.
 * 4. Reshapes the bias tensor to match the dimensions of the input tensors.
 * 5. Multiplies the position tensor by the bias tensor.
 * 6. Adds the result of the multiplication to the source tensor to produce the
 *    final output.
 */
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
                                  float m, int64_t size, float start, float stop, float step) {
    int64_t ne[] = {size};
    size_t nb[] = {sizeof(float)};
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                        aclTensor* acl_position, aclTensor* acl_dst,
                        const int n_head, int64_t* src_ne, const size_t src_nb0,
                        float max_bias, ggml_tensor* dst) {
    const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
    GGML_ASSERT(src_nb0 == sizeof(float));
    GGML_ASSERT(n_head == src_ne[2]);

    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
    void* arange_buffer = arange_allocator.get();
    const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));

    aclTensor* arange_tensor = ggml_cann_create_tensor(
        arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
    aclnn_arange(ctx, arange_tensor, start, stop, step, size);
    float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);

    aclTensor* slope_tensor = ggml_cann_create_tensor(
        slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
    // init arange
    ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                          ne2_ne3 * ggml_type_size(dst->type));
    void* tmp_arange_buffer = arange_allocator.get();

    aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
    // arange1: [1, ..., n_heads_log2_floor+1)
    float start = 1;
    float stop = n_heads_log2_floor + 1;
    float step = 1;
    int64_t n_elements_arange = n_heads_log2_floor;

    GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
    ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
    int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
    size_t tmp_arange1_nb[] = {sizeof(dst->type)};
    aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

    aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);

    aclTensor* tmp_arange2_tensor = nullptr;
    if (n_heads_log2_floor < ne2_ne3) {
        // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
        start = 1;
        stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
        step = 2;
        n_elements_arange = ne2_ne3 - n_heads_log2_floor;
        int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
        size_t tmp_arange2_nb[] = {sizeof(dst->type)};

        aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
            (char*)tmp_arange_buffer +
                n_heads_log2_floor * ggml_type_size(dst->type),
            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
            tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
        aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
                     n_elements_arange);
    }

    // init mk_base
    ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
                                           ne2_ne3 * ggml_type_size(dst->type));
    void* tmp_mk_base_buffer = mk_base_allocator.get();
    int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
    size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
    aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

    aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);

    aclTensor* tmp_mk_base2_tensor = nullptr;
    if (n_heads_log2_floor < ne2_ne3) {
        int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
        size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
        aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
            (char*)tmp_mk_base_buffer +
                n_heads_log2_floor * ggml_type_size(dst->type),
            ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
            tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
        aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
    }

    // init mk
    int64_t tmp_mk_base_ne[] = {ne2_ne3};
    size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
    aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
    aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
        tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
        GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
    aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);

    // reshape mk
    int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
    size_t tmp_mk_nb[GGML_MAX_DIMS];
    tmp_mk_nb[0] = ggml_type_size(dst->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
    }
    aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
        tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
        ACL_FORMAT_ND);

    // acl_position * mk
    int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
    size_t tmp_output_nb[GGML_MAX_DIMS];
    tmp_output_nb[0] = ggml_type_size(dst->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
    }
    ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
    void* tmp_output_buffer = output_allocator.get();
    aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
        tmp_output_buffer, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
        ACL_FORMAT_ND);
    aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);

    // add
    aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
    ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
        tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
        tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
}

/**
 * @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
 *
 * This function generates slope values for each attention head according to the ALiBi
 * (Attention with Linear Biases) method. It splits the computation into two ranges depending
 * on whether the head index is less than @p n_head_log2 or not, and uses different base values
 * (`m0` and `m1`) for the exponentiation.
 *
 * @f[
 * slope[h] =
 * \begin{cases}
 * m_0^{(h + 1)}, & h < n\_head\_log2 \\
 * m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
 * \end{cases}
 * \quad , \quad \text{if } max\_bias > 0
 * @f]
 *
 * If @p max_bias <= 0, all slope values are set to 1.0.
 *
 * @param ctx CANN backend context for memory allocation and operator execution.
 * @param n_head Total number of attention heads.
 * @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
 * @param max_bias Maximum bias value for slope computation.
 *
 */
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
                            void* slope_buffer, float max_bias) {
    const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

    float m0 = powf(2.0f, -(max_bias) / n_head_log2);
    float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    // const float slope = (max_bias > 0.0f) ?
    //     h < n_head_log2 ?
    //         powf(m0, h + 1) :
    //         powf(m1, 2*(h - n_head_log2) + 1) :
    //     1.0f;
    // arange1
    float start = 0 + 1;
    float end = (n_head_log2 - 1) + 1;
    float step = 1;
    float count = n_head_log2;
    // end needs to be +1 because aclnn uses a left-closed, right-open interval.
    aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
    if (n_head_log2 < n_head) {
        // arange2
        start = 2 * (n_head_log2 - n_head_log2) + 1;
        end = 2 * ((n_head - 1) - n_head_log2) + 1;
        step = 2;
        count = n_head - n_head_log2;
        aclnn_get_slope_inner(
            ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
            m1, count, start, end + 1, step);
    }
}
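// CPU reference for the same slope schedule (sketch mirroring the
// documented formula; buffer management elided):
//     const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
//     const float m0 = powf(2.0f, -max_bias / n_head_log2);
//     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
//     for (int h = 0; h < n_head; h++) {
//         slope[h] = h < n_head_log2 ? powf(m0, h + 1)
//                                    : powf(m1, 2 * (h - n_head_log2) + 1);
//     }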

/**
 * @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
 *
 * This function computes the ALiBi slopes for each attention head (if max_bias > 0),
 * multiplies them with the attention mask to produce bias tensors, and adds these biases
 * to the destination tensor (@p dst).
 *
 * The function performs necessary broadcasting of the mask and slope tensors to match
 * the shape of the destination tensor, then applies element-wise multiplication and addition
 * using CANN operators.
 *
 * @param ctx CANN backend context for memory management and operator execution.
 * @param mask Input attention mask tensor, assumed to be contiguous.
 * @param dst Destination tensor to which ALiBi biases will be added.
 * @param dst_ptr Pointer to the memory of the destination tensor.
 * @param max_bias Maximum bias value controlling the slope scaling.
 *
 * @note
 * - Write data into dst_ptr using only the shape information of the dst tensor.
 * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
 */
static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
                            ggml_tensor* dst, void* dst_ptr, float max_bias) {
    void* slope_buffer = nullptr;
    void* bias_buffer  = nullptr;

    if (max_bias > 0.0f) {
        int64_t n_heads = dst->ne[2];
        ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
        slope_buffer = slope_allocator.get();
        ggml_cann_pool_alloc bias_allocator(
            ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
        bias_buffer = bias_allocator.get();
        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
    }

    // broadcast for mask, slope and dst
    int64_t nr2 = dst->ne[2] / mask->ne[2];
    int64_t nr3 = dst->ne[3] / mask->ne[3];

    // broadcast the mask across rows
    int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
    size_t mask_nb[] = {
        mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
        mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
    };

    int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
    size_t dst_nb[] = {
        dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
        dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
    };

    // slope is a 1-dim tensor, slope.ne2 == dst.ne2
    int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
    size_t slope_nb[GGML_MAX_DIMS + 2];
    slope_nb[0] = sizeof(float);
    for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
        slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
    }

    aclTensor* acl_slope = ggml_cann_create_tensor(
        slope_buffer, ACL_FLOAT, sizeof(float),
        slope_ne, slope_nb, GGML_MAX_DIMS + 2);
    aclTensor* acl_mask = ggml_cann_create_tensor(
        mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);

    // write data into dst_ptr using only the shape information of the dst tensor
    aclTensor* acl_dst = ggml_cann_create_tensor(
        dst_ptr, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), dst_ne, dst_nb,
        GGML_MAX_DIMS + 2);

    if (max_bias > 0.0f) {
        int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
        size_t bias_nb[GGML_MAX_DIMS + 2];
        bias_nb[0] = sizeof(float);
        for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
            bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
        }
        aclTensor* bias_tensor = ggml_cann_create_tensor(
            bias_buffer, ACL_FLOAT, sizeof(float),
            bias_ne, bias_nb, GGML_MAX_DIMS + 2);

        aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
        aclnn_add(ctx, acl_dst, bias_tensor);
        ggml_cann_release_resources(ctx, bias_tensor);
    } else {
        aclnn_add(ctx, acl_dst, acl_mask);
    }
    ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
}
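
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Concrete example of the GGML_MAX_DIMS + 2 broadcast split used above,
// assuming GGML_MAX_DIMS == 4. A dst of shape [64, 32, 16, 4] with a mask of
// shape [64, 32, 2, 1] gives nr2 = 8 and nr3 = 4; dims 2 and 3 are each split
// into (shared, repeat) pairs so element-wise ops can broadcast. Shapes here
// are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
    int64_t dst[4]  = { 64, 32, 16, 4 };
    int64_t mask[4] = { 64, 32,  2, 1 };
    int64_t nr2 = dst[2] / mask[2];  // repeats along dim 2
    int64_t nr3 = dst[3] / mask[3];  // repeats along dim 3
    int64_t dst6[6]  = { dst[0],  dst[1], mask[2], nr2, mask[3], nr3 };
    int64_t mask6[6] = { mask[0], dst[1], mask[2], 1,   mask[3], 1 };
    for (int i = 0; i < 6; i++) {
        printf("dim %d: dst %lld vs mask %lld\n", i, (long long) dst6[i], (long long) mask6[i]);
    }
    return 0;
}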

void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_cann_dup(ctx, dst);
}

@@ -1523,135 +1463,165 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 * @param acl_dst The destination tensor where the softmax results will be
 * stored.
 */
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                          int64_t dim, aclTensor* acl_dst) {
    GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
}

void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];  // mask

    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst  = ggml_cann_create_tensor(dst);

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    // input mul scale
    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
    ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
    void* src_tensor_buffer = src_tensor_allocator.get();
    aclTensor* softmax_tensor = ggml_cann_create_tensor(
        src_tensor_buffer, ggml_cann_type_mapping(src0->type),
        ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);

    aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);

    if (src1) {
        aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
    }
    // softmax
    aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
    ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
}

void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];  // mask

    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst  = ggml_cann_create_tensor(dst);

    float scale    = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));

    // input mul scale
    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
    size_t n_bytes = ggml_nbytes(src0);
    ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
    void* input_mul_scale_buffer = mul_scale_allocator.get();
    aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
        input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
        src0->nb, GGML_MAX_DIMS);

    bool inplace = false;
    aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);

    // mask
    aclTensor* acl_src1_fp32_tensor = nullptr;
    aclTensor* tmp_mask_tensor = nullptr;
    ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
    if (src1) {
        const bool use_f16 = src1->type == GGML_TYPE_F16;
        if (use_f16) {
            // cast to fp32
            size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
            size_t src1_fp32_nb[GGML_MAX_DIMS];
            src1_fp32_nb[0] = sizeof(float_t);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
            }
            src1_fp32_allocator.alloc(n_bytes);
            void* src1_fp32_buffer = src1_fp32_allocator.get();
            acl_src1_fp32_tensor = ggml_cann_create_tensor(
                src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
                src1_fp32_nb, GGML_MAX_DIMS);
            aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
            aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
            ggml_cann_release_resources(ctx, acl_src1);
        } else {
            acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
        }

        // broadcast the mask across rows, only use ne11 of ne01 in mask
        if (src1->ne[1] != src0->ne[1]) {
            // mask shape: [1,1,ne11,ne10]
            int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
            size_t tmp_mask_nb[GGML_MAX_DIMS];
            tmp_mask_nb[0] = sizeof(float_t);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
            }
            tmp_mask_tensor = ggml_cann_create_tensor(
                src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
                GGML_MAX_DIMS, ACL_FORMAT_ND);
        }

        // alibi
        const int n_head = src0->ne[2];
        const size_t src_nb0 = src0->nb[0];

        n_bytes = ggml_nbytes(dst);
        ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
        void* output_buffer = output_allocator.get();
        aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
            output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
            dst->nb, GGML_MAX_DIMS);
        if (max_bias <= 0.0f) {
            // slope = 1.0
            if (tmp_mask_tensor) {
                aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
                          alibi_output_tensor);
            } else {
                aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
                          alibi_output_tensor);
            }
        } else {
            // slope != 1.0
            if (tmp_mask_tensor) {
                aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
                            alibi_output_tensor, n_head, src0->ne, src_nb0,
                            max_bias, dst);
            } else {
                aclnn_alibi(ctx, acl_input_mul_scale_tensor,
                            acl_src1_fp32_tensor, alibi_output_tensor, n_head,
                            src0->ne, src_nb0, max_bias, dst);
            }
        }

        // softmax
        aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
        ggml_cann_release_resources(ctx, alibi_output_tensor);
    } else {
        aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
    }

    ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
                                acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
}
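
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Host-side reference for what both soft-max variants above compute: scale and
// max_bias live in the first two floats of op_params, then
// dst = softmax(src0 * scale + slope * mask) row by row. A plain CPU sketch
// with hypothetical names, not the CANN kernel.
#include <cmath>
#include <vector>

static void softmax_row(std::vector<float>& x, float scale, const float* mask, float slope) {
    float max_v = -INFINITY;
    for (size_t i = 0; i < x.size(); i++) {
        x[i] = x[i] * scale + (mask ? slope * mask[i] : 0.0f);
        max_v = fmaxf(max_v, x[i]);
    }
    float sum = 0.0f;
    for (float& v : x) { v = expf(v - max_v); sum += v; }
    for (float& v : x) { v /= sum; }
}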

/**
 * @brief Performs index select operation on a 4D tensor using the CANN backend.
 *
 * This function applies the `IndexSelect` operation along a specific dimension
 * of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
 * It iterates over the last two dimensions of the source tensor, creates the corresponding
 * CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
 * operation for each slice.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer containing the 4D input tensor data.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param dst_buffer The destination buffer where the output tensor data will be written.
 * @param dst_ne The dimensions of the destination tensor.
 * @param dst_nb The strides (byte offsets) of the destination tensor.
 * @param index The index tensor specifying the indices to select from the source tensor.
 * @param type The data type of the source and destination tensors.
 */
static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
                                  void* src_buffer, int64_t* src_ne, size_t* src_nb,
                                  void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
                                  ggml_tensor* index, ggml_type type) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                src_ne, src_nb, 2);

            // index
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                index->ne, index->nb, 1);

            // out
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                dst_ne, dst_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}
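
// ---- Illustrative sketch (not part of the diff) ----------------------------
// What one inner iteration above does, as plain C++: for a fixed (i, j) plane,
// out[r] = src[idx[r]] row-wise. Container layout and names are hypothetical.
#include <cstdint>
#include <vector>

static void index_select_rows(const std::vector<std::vector<float>>& src,
                              const std::vector<int64_t>& idx,
                              std::vector<std::vector<float>>& out) {
    out.resize(idx.size());
    for (size_t r = 0; r < idx.size(); r++) {
        out[r] = src[(size_t) idx[r]];  // dim-0 gather, like aclnnIndexSelect
    }
}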

/**
 * @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
 *
 * This function applies the `IndexCopy` operation along a specific dimension of the
 * destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
 * to positions specified by the index tensor (`index`).
 * It iterates over the last two dimensions of the tensors, creates the corresponding
 * CANN tensors for source, index, and destination slices, and performs the index copy
 * operation for each slice.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer containing the 4D input tensor data to be copied.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param dst_buffer The destination buffer where values will be copied to.
 * @param dst_ne The dimensions of the destination tensor.
 * @param dst_nb The strides (byte offsets) of the destination tensor.
 * @param index The index tensor specifying target positions in the destination tensor.
 * @param type The data type of the source and destination tensors.
 */
static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
                                void* src_buffer, int64_t* src_ne, size_t* src_nb,
                                void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
                                ggml_tensor* index, ggml_type type) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                src_ne, src_nb, 2);

            // index
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                index->ne, index->nb, 1);

            // out
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
                ggml_cann_type_mapping(type), ggml_type_size(type),
                dst_ne, dst_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}

/**
 * @brief Performs embedding operation on a 4D tensor using the CANN backend.
 *
 * This function extracts slices from the source tensor (`src_buffer`),
 * index tensor (`index`), and destination tensor (`dst`), and performs an
 * embedding operation on them. The embedding operation is applied by iterating
 * over the last two dimensions of the source tensor, creating the necessary
 * tensors for the source, index, and output, and executing the embedding operation.
 *
 * @param ctx The context for CANN backend operations.
 * @param src_buffer The source buffer holding the data for the source tensor.
 * @param src_ne The dimensions of the source tensor.
 * @param src_nb The strides (byte offsets) of the source tensor.
 * @param index The index tensor used in the embedding operation.
 * @param dst The destination tensor where the result will be stored.
 */
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
                               int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
                               ggml_tensor* dst) {
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            // src
            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
                acl_src_ne, acl_src_nb, 2);

            // index
            int64_t acl_index_ne[1] = {index->ne[0]};
            size_t acl_index_nb[1] = {index->nb[0]};
            aclTensor* acl_index = ggml_cann_create_tensor(
                (char*)index->data + i * index->nb[2] + j * index->nb[1],
                ggml_cann_type_mapping(index->type), ggml_element_size(index),
                acl_index_ne, acl_index_nb, 1);

            // out
            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
            aclTensor* acl_out = ggml_cann_create_tensor(
                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
                acl_out_ne, acl_out_nb, 2);
            GGML_CANN_CALL_ACLNN_OP(ctx, Embedding, acl_src_tensor, acl_index, acl_out);
            ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
        }
    }
}
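
// ---- Illustrative sketch (not part of the diff) ----------------------------
// How the i/j loops above cover a 4-D ggml tensor with repeated 2-D kernel
// calls: one launch per (i, j) plane, with the index tensor broadcast via the
// modulo on its outer dims in the index-copy variant. A plain counting sketch
// with hypothetical shapes.
#include <cstdint>
#include <cstdio>

int main() {
    int64_t src_ne[4]   = { 128, 64, 4, 2 };  // ne[2] and ne[3] drive the loops
    int64_t index_ne[3] = { 64, 2, 1 };       // broadcast over plane coordinates
    int calls = 0;
    for (int64_t i = 0; i < src_ne[3]; i++) {
        for (int64_t j = 0; j < src_ne[2]; j++) {
            int64_t idx_k = i % index_ne[2];  // wraps when index has fewer planes
            int64_t idx_j = j % index_ne[1];
            (void) idx_k; (void) idx_j;
            calls++;                          // one IndexSelect/Embedding launch per plane
        }
    }
    printf("kernel launches: %d\n", calls);   // prints 8
    return 0;
}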

@@ -1663,9 +1633,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

    switch (src0->type) {
        case GGML_TYPE_F32: {
            aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
                                  dst->data, dst->ne, dst->nb,
                                  src1, dst->type);
            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
                               dst);
            break;
        }
        case GGML_TYPE_F16: {
@@ -1682,9 +1651,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
                src0->ne, src_trans_nb, GGML_MAX_DIMS);
            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
            aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
                                  dst->data, dst->ne, dst->nb,
                                  src1, dst->type);
            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
                               src_trans_nb, src1, dst);
            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
            break;
        }
@@ -1744,10 +1712,8 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
            }

            aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
                                  dequant_ne, dequant_nb,
                                  dst->data, dst->ne, dst->nb,
                                  src1, dst->type);
            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
                               dequant_ne, dequant_nb, src1, dst);

            ggml_cann_release_resources(ctx, dequant_tensor);
            break;
@@ -1758,43 +1724,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    }
}

void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];  // src
    ggml_tensor* src1 = dst->src[1];  // index

    switch (dst->type) {
        case GGML_TYPE_F32: {
            aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb,
                                dst->data, dst->ne, dst->nb,
                                src1, dst->type);
            break;
        }
        case GGML_TYPE_F16: {
            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
            ggml_cann_pool_alloc src_buffer_allocator(
                ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
            void* src_trans_buffer = src_buffer_allocator.get();
            size_t src_trans_nb[GGML_MAX_DIMS];
            src_trans_nb[0] = sizeof(uint16_t);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
                src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
                src0->ne, src_trans_nb, GGML_MAX_DIMS);
            aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
            aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
                                dst->data, dst->ne, dst->nb,
                                src1, dst->type);
            ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
            break;
        }
        default:
            GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
            break;
    }
}
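
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Host-side reference for the set-rows path above: the inverse of get-rows,
// dst[idx[r]] = src[r] row-wise. Hypothetical flat layout.
#include <cstdint>
#include <vector>

static void set_rows_ref(const std::vector<float>& src, const std::vector<int64_t>& idx,
                         std::vector<float>& dst, size_t row_size) {
    for (size_t r = 0; r < idx.size(); r++) {
        for (size_t c = 0; c < row_size; c++) {
            // scatter, like aclnnInplaceIndexCopy on dim 0
            dst[(size_t) idx[r] * row_size + c] = src[r * row_size + c];
        }
    }
}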

/**
 * @brief Repeats elements of a tensor along a specified dimension.
 *
@@ -1858,9 +1787,11 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                                     bcast_weight_nb[4], bcast_weight_nb[5]};
    aclTensor* acl_weight_tensor;

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
    if (weight_to_nz && is_matmul_weight(weight)) {
    bool weightToNZ = false;
#ifdef ASCEND_310P
    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
    if (weightToNZ && is_matmul_weight(weight)) {
        int64_t acl_stride[2] = {1, transpose_ne[1]};

        // Reverse ne.

@@ -3153,24 +3084,104 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
        // Compute the slope if needed. Derived from ggml_cann_softmax().
        if (maxBias != 0.0f) {
            // alibi
            const int64_t n_heads = src0->ne[2];
            ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
            void* slope_buffer = slope_allocator.get();
            aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);

            int64_t slope_ne[] = {1, 1, n_heads, 1};
            size_t slope_nb[GGML_MAX_DIMS];
            slope_nb[0] = sizeof(float);
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
            }

            aclTensor* slope_tensor = ggml_cann_create_tensor(
                slope_buffer, ACL_FLOAT, sizeof(float),
                slope_ne, slope_nb, GGML_MAX_DIMS);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);

            ggml_cann_release_resources(ctx, slope_tensor);

            const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
            const int64_t n_head = src0->ne[2];
            const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
            float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
            float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
            // init arange
            ggml_cann_pool_alloc arange_allocator(ctx.pool(),
                                                  ne2_ne3 * faElemSize);
            void* tmp_arange_buffer = arange_allocator.get();

            // arange1: [1, ..., n_heads_log2_floor+1)
            float start = 1;
            float stop = n_heads_log2_floor + 1;
            float step = 1;
            int64_t n_elements_arange = n_heads_log2_floor;

            int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
            size_t tmp_arange1_nb[] = {faElemSize};
            aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
                tmp_arange_buffer, faDataType, faElemSize,
                tmp_arange1_ne, tmp_arange1_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

            aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);

            aclTensor* tmp_arange2_tensor = nullptr;
            if (n_heads_log2_floor < ne2_ne3) {
                // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
                start = 1;
                stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
                step = 2;
                n_elements_arange = ne2_ne3 - n_heads_log2_floor;
                int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
                size_t tmp_arange2_nb[] = {faElemSize};

                tmp_arange2_tensor = ggml_cann_create_tensor(
                    (char*)tmp_arange_buffer +
                        n_heads_log2_floor * faElemSize,
                    faDataType, faElemSize,
                    tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
                             n_elements_arange);
            }

            // init mk_base
            ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
                                                   ne2_ne3 * faElemSize);
            void* tmp_mk_base_buffer = mk_base_allocator.get();
            int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
            size_t tmp_mk_base1_nb[] = {faElemSize};
            aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
                tmp_mk_base_buffer, faDataType, faElemSize,
                tmp_mk_base1_ne, tmp_mk_base1_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);

            aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);

            aclTensor* tmp_mk_base2_tensor = nullptr;
            if (n_heads_log2_floor < ne2_ne3) {
                int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
                size_t tmp_mk_base2_nb[] = {faElemSize};
                tmp_mk_base2_tensor = ggml_cann_create_tensor(
                    (char*)tmp_mk_base_buffer +
                        n_heads_log2_floor * faElemSize,
                    faDataType, faElemSize,
                    tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
                aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
            }

            // init mk
            int64_t tmp_mk_base_ne[] = {ne2_ne3};
            size_t tmp_mk_base_nb[] = {faElemSize};
            aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
                tmp_mk_base_buffer, faDataType, faElemSize,
                tmp_mk_base_ne, tmp_mk_base_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
            aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
                tmp_arange_buffer, faDataType, faElemSize,
                tmp_mk_base_ne, tmp_mk_base_nb,
                GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
            aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);

            // reshape mk
            int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
            size_t tmp_mk_nb[GGML_MAX_DIMS];
            tmp_mk_nb[0] = faElemSize;
            for (int i = 1; i < GGML_MAX_DIMS; i++) {
                tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
            }
            aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
                tmp_mk_base_buffer, faDataType, faElemSize,
                tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
                ACL_FORMAT_ND);
            GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);

            ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
                                        tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
                                        tmp_arange_tensor, tmp_mk_tensor);
        }
    }
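
// ---- Illustrative sketch (not part of the diff) ----------------------------
// What the arange/fill/pow sequence in the older variant above amounts to on
// the host: slopes are base^exponent, where base is m0 for the first
// n_heads_log2_floor heads and m1 afterwards, and the exponents come from the
// two aranges. Hypothetical helper; compare with compute_alibi_slopes()
// sketched earlier in this file.
#include <cmath>
#include <cstdint>
#include <vector>

static std::vector<float> slopes_via_pow(int64_t n, float max_bias) {
    const int64_t k = (int64_t) 1 << (int) floor(log2((double) n));
    const float m0 = powf(2.0f, -max_bias / k);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / k);
    std::vector<float> out((size_t) n);
    for (int64_t i = 0; i < n; i++) {
        float base = i < k ? m0 : m1;                                  // aclnn_fill_scalar
        float expo = i < k ? (float) (i + 1)                           // arange1: 1, 2, ...
                           : (float) (2 * (i - k) + 1);                // arange2: 1, 3, 5, ...
        out[(size_t) i] = powf(base, expo);                            // aclnn_pow_tensor_tensor
    }
    return out;
}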

@@ -424,25 +424,15 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 *
 * @details This function retrieves rows from a source tensor src0 according to
 * the indices provided in another tensor src1 and stores the result in
 * a destination tensor (\p dst).
 * a destination tensor (\p dst). It supports different data types
 * including F32, F16, Q4_0, and Q8_0.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the extracted rows will be stored.
 * dst->op is `GGML_OP_GET_ROWS`.
 */
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Writes specific rows into a tensor at positions specified by indices.
 *
 * @details This function copies rows from a source tensor into a destination
 * tensor (\p dst) at the positions indicated by the indices in another
 * tensor.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the specified rows will be updated.
 */
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Executes matrix multiplication for the given tensor.
 *
@@ -1108,7 +1098,7 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 * @param dst The destination tensor. Its src[0] is treated as the input tensor.
 */
template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
void ggml_cann_unary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

    aclTensor* acl_src = ggml_cann_create_tensor(src);
@@ -1119,125 +1109,49 @@ template <void unary_op(ggml_backend_cann_context&, aclTensor*, aclTensor*)>
}

/**
 * @brief Applies a unary operation to a ggml tensor using the CANN backend.
 *
 * @details This function applies a unary operation to the input tensor using
 * a user-provided lambda or callable `unary_op`. The lambda receives the
 * CANN backend context and two ACL tensors: the source and the destination.
 *
 * Internally, this function handles the conversion from GGML tensors to ACL tensors,
 * calls the provided unary op, and manages resource cleanup. The input is assumed
 * to be `dst->src[0]`, and the result is written to `dst`.
 *
 * This utility simplifies writing unary op wrappers by abstracting tensor preparation.
 *
 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
 * @param ctx The CANN context for operation execution.
 * @param dst The destination ggml_tensor where the result will be stored.
 * The input tensor is assumed to be `dst->src[0]`.
 *
 * @see GGML_CANN_CALL_OP_UNARY
 */
void ggml_cann_op_unary(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Applies a unary operation to a ggml tensor using the CANN backend.
 *
 * @details This function performs a unary operation on the input tensor using
 * a user-provided lambda or callable object `unary_op`, which accepts the CANN
 * context and two ACL tensors (source and destination). Internally, this function
 * creates ACL representations of the ggml tensors and invokes the unary operation.
 * The result is stored in the destination tensor `dst`. This utility abstracts the
 * common boilerplate of tensor conversion and cleanup when implementing unary ops.
 *
 * @param unary_op A callable that performs the unary operation using CANN APIs.
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the result will be stored.
 * The source tensor is retrieved from `dst->src[0]`.
 */
void ggml_cann_unary_op(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Applies a gated (GLU-style) unary operation using the CANN backend.
 *
 * @details This function performs a gated activation such as GEGLU or ReGLU.
 * It supports two input modes:
 *
 * 1. **Dual input mode**: `dst->src[0]` and `dst->src[1]` are both valid tensors.
 *    These are used directly as the value and gate tensors.
 *
 * 2. **Packed input mode**: Only `dst->src[0]` is valid, and it is assumed to
 *    contain a concatenation of value and gate along the first dimension. This tensor
 *    will be split into two equal halves to form the value and gate inputs.
 *
 * The function applies a user-provided unary operation (e.g., GELU) to the value tensor,
 * then multiplies the result in-place with the gate tensor:
 *
 * @code
 * dst = unary_op(value) * gate;
 * @endcode
 *
 * The `swapped` parameter (from `dst->op_params[1]`) allows flipping the
 * order of value/gate in the packed input case.
 *
 * @param unary_op A callable that performs the unary operation using CANN ACL APIs.
 * It receives (ctx, acl_value_tensor, acl_output_tensor).
 * @param ctx The CANN context used for execution.
 * @param dst The destination ggml_tensor. Source tensors are in `dst->src[0]` and optionally `src[1]`.
 *
 * @see GGML_CANN_CALL_OP_UNARY_GATED
 */
void ggml_cann_op_unary_gated(
    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
    ggml_backend_cann_context& ctx, ggml_tensor* dst);

/**
 * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary.
 *
 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
 * and passes it to `ggml_cann_op_unary`, which handles the common logic for executing
 * unary ops in the CANN backend.
 *
 * Internally, this macro expands to a lambda like:
 * @code
 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
 *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * };
 * @endcode
 *
 * This lambda is then passed to `ggml_cann_op_unary`, which applies the operation.
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_op_unary
 * @see GGML_CANN_CALL_ACLNN_OP
 */
#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context& ctx, \
                         aclTensor* acl_src, \
                         aclTensor* acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_op_unary(lambda, ctx, dst); \
    } \
    while (0)

/**
 * @brief Helper macro to invoke a unary ACL operation using ggml_cann_unary_op.
 *
 * This macro defines an inline lambda wrapping a specific ACL operation name,
 * and passes it to the templated ggml_cann_unary_op function. It simplifies
 * calling unary ops by hiding the lambda boilerplate.
 *
 * Internally, the lambda will call:
 * @code
 * GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * @endcode
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_unary_op
 * @see GGML_CANN_CALL_ACLNN_OP
 */
#define GGML_CANN_CALL_UNARY_OP(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context& ctx, \
                         aclTensor* acl_src, \
                         aclTensor* acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_unary_op(lambda, ctx, dst); \
    } \
    while (0)

/**
 * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated.
 *
 * This macro wraps the specified ACLNN unary operator name into a lambda expression,
 * and passes it to `ggml_cann_op_unary_gated`, which handles the common logic for
 * executing gated unary ops in the CANN backend.
 *
 * Internally, this macro expands to a lambda like:
 * @code
 * [](ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) {
 *     GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst);
 * };
 * @endcode
 *
 * This lambda is then passed to `ggml_cann_op_unary_gated`, which applies the operation.
 *
 * @param OP_NAME The name of the ACL unary operator to invoke via GGML_CANN_CALL_ACLNN_OP.
 *
 * @see ggml_cann_op_unary_gated
 * @see GGML_CANN_CALL_ACLNN_OP
 */
#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \
    do { \
        auto lambda = [](ggml_backend_cann_context& ctx, \
                         aclTensor* acl_src, \
                         aclTensor* acl_dst) { \
            GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \
        }; \
        ggml_cann_op_unary_gated(lambda, ctx, dst); \
    } \
    while (0)
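
// ---- Illustrative sketch (not part of the diff) ----------------------------
// What one use of the helper macro expands to, written out by hand. OP_NAME is
// spliced into the GGML_CANN_CALL_ACLNN_OP invocation, and `dst` must be in
// scope at the expansion site, as it is in ggml_cann_compute_forward below.
// Hypothetical expansion for GGML_CANN_CALL_OP_UNARY(Abs):
//
//   do {
//       auto lambda = [](ggml_backend_cann_context& ctx,
//                        aclTensor* acl_src,
//                        aclTensor* acl_dst) {
//           GGML_CANN_CALL_ACLNN_OP(ctx, Abs, acl_src, acl_dst);
//       };
//       ggml_cann_op_unary(lambda, ctx, dst);
//   } while (0);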

#endif // CANN_ACLNN_OPS

@@ -337,29 +337,6 @@ private:
    int32_t device_;
};

#ifdef USE_ACL_GRAPH
struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cann_graph {
    ~ggml_cann_graph() {
        if (graph != nullptr) {
            aclmdlRIDestroy(graph);
        }
    }

    aclmdlRI graph = nullptr;

    std::vector<ggml_graph_node_properties> ggml_graph_properties;
};
#endif // USE_ACL_GRAPH

/**
 * @brief Context for managing CANN backend operations.
 */
@@ -368,13 +345,8 @@ struct ggml_backend_cann_context {
    std::string name;                /**< Name of the device. */
    std::string description;         /**< Description of the device. */
    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
#ifdef USE_ACL_GRAPH
    /// Cached CANN ACL graph used for executing the current ggml computation graph.
    std::unique_ptr<ggml_cann_graph> cann_graph;
#endif
    cann_task_queue task_queue;
    bool async_mode;
    bool support_set_rows;

    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */

@@ -390,14 +362,6 @@ struct ggml_backend_cann_context {
        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
        GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                      device, async_mode ? "ON" : "OFF");

        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");

        if (!support_set_rows) {
            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
                          "Falling back to eager mode.\n", __func__);
        }
    }
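
// ---- Illustrative sketch (not part of the diff) ----------------------------
// A minimal stand-in for the parse_bool/get_env gate used above, assuming the
// usual truthy spellings. The real helpers live elsewhere in the backend; this
// only shows the intended behavior.
#include <cstdlib>
#include <string>

static bool env_flag(const char* name) {
    const char* v = std::getenv(name);
    if (v == nullptr) {
        return false;
    }
    std::string s(v);
    return s == "1" || s == "on" || s == "yes" || s == "true";
}
// usage: bool async_mode = env_flag("GGML_CANN_ASYNC_MODE");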

/**

@@ -1116,59 +1116,61 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
    return GGML_STATUS_SUCCESS;
}

// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
namespace {
    void* g_nz_workspace = nullptr;
    size_t g_nz_workspace_allocated = 0;

    void release_nz_workspace() {
        if (g_nz_workspace) {
            aclrtFree(g_nz_workspace);
            g_nz_workspace = nullptr;
            g_nz_workspace_allocated = 0;
        }
    }

    void relloc_nz_workspace(size_t new_size) {
        if (new_size > g_nz_workspace_allocated) {
            if (g_nz_workspace) {
                aclrtFree(g_nz_workspace);
                g_nz_workspace = nullptr;
            }
            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
            g_nz_workspace_allocated = new_size;
        }
    }
}  // namespace

static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
                                 aclDataType dataType, aclTensor **tensor)
{
    uint64_t size = 1;
    for (auto i : shape) {
        size *= i;
    }

    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));

    size *= sizeof(int16_t);

    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);

    std::vector<int64_t> strides(shape.size(), 1);
    for (int64_t i = shape.size() - 2; i >= 0; i--) {
        strides[i] = shape[i + 1] * strides[i + 1];
    }

    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
                              shape.data(), shape.size(), *deviceAddr);
    return 0;
}

/**
 * @brief Convert tensor weights to NZ format using Ascend CANN API.
 *
 * This function creates a transposed tensor descriptor and performs the
 * TransMatmulWeight operation. Converting tensor formats can significantly
 * improve performance on certain hardware.
 *
 * @param tensor Pointer to the input ggml_tensor containing the weights.
 * @param data Pointer to the raw data buffer for the tensor weights.
 * @param offset Byte offset within the tensor data buffer where weights start.
 *
 * @note The workspace buffer used in this function is managed globally and reused
 * across calls. This reduces overhead from repeated memory allocation and deallocation.
 */
static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
                                                          tensor->nb, 2, ACL_FORMAT_ND, offset);

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
                                                     &workspaceSize, &executor));
    // Avoid frequent malloc/free of the workspace.
    relloc_nz_workspace(workspaceSize);

    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));

    ACL_CHECK(aclDestroyTensor(weightTransposed));
}

static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
    aclrtStream stream;
    ACL_CHECK(aclrtCreateStream(&stream));

    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
    void *weightTransposedDeviceAddr = nullptr;
    aclTensor *weightTransposed = nullptr;
    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
                          ggml_cann_type_mapping(tensor->type), &weightTransposed);

    uint64_t workspaceSize = 0;
    aclOpExecutor *executor;
    void *workspaceAddr = nullptr;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
    if (workspaceSize > 0) {
        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
        workspaceAddrPtrTrans.reset(workspaceAddr);
    }
    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));

    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);

    aclrtMemcpy((char *)tensor->data + offset, size,
                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
    ACL_CHECK(aclDestroyTensor(weightTransposed));
    aclrtFree(weightTransposedDeviceAddr);
}
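
// ---- Illustrative sketch (not part of the diff) ----------------------------
// Caller-side flow for the ND->NZ conversion above, reduced to pseudocode in
// C++. All names here are hypothetical stand-ins for the real backend helpers.
#include <cstddef>

struct tensor_stub {};  // stands in for ggml_tensor

static bool is_eligible_matmul_weight(const tensor_stub*) { return true; }  // assumed predicate
static void copy_host_to_device(tensor_stub*, const void*, size_t) {}       // assumed memcpy wrapper
static void convert_weight_to_nz(tensor_stub*, const void*, size_t) {}      // stands in for weight_format_to_nz

static void set_tensor_sketch(tensor_stub* t, const void* host, size_t offset,
                              size_t size, bool weight_to_nz) {
    copy_host_to_device(t, host, size);
    if (weight_to_nz && is_eligible_matmul_weight(t)) {
        // The cached NZ workspace grows monotonically, so repeated
        // conversions avoid per-tensor malloc/free churn.
        convert_weight_to_nz(t, host, offset);
    }
}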

// TODO: need to handle tensors which have paddings.
@@ -1195,14 +1197,14 @@ static void ggml_backend_cann_buffer_set_tensor(
    // For acl, synchronous functions use this default stream.
    // Why aclrtSynchronizeDevice?

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
    bool weightToNZ = false;
#ifdef ASCEND_310P
    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
#endif
    if (!need_transform(tensor->type)) {
        ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                              ACL_MEMCPY_HOST_TO_DEVICE));
        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
            GGML_ASSERT(tensor->ne[2] == 1);
            GGML_ASSERT(tensor->ne[3] == 1);
        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
            weight_format_to_nz(tensor, data, offset);
        }
    } else {
@@ -1438,32 +1440,20 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));

    // The last line must be bigger than 32 bytes, because every single op
    // deals with at least 32 bytes.
    // TODO: quantized type?
    // int64_t line_size = ne0 * ggml_element_size(tensor);
    // int64_t line_size_align_32 = (line_size + 31) & ~31;
    // size += (line_size_align_32 - line_size);

    // TODO: quantized types are not supported yet.
    // TODO: consider non-contiguous tensors.
    if (ggml_is_quantized(tensor->type)) {
        if (ne0 % MATRIX_ROW_PADDING != 0) {
            size += ggml_row_size(
                tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
        }
    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
        // NZ-format weights do not support quantized types yet.
        // If an ND tensor is transformed to NZ, its size may change.
        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
        GGML_ASSERT(tensor->ne[2] == 1);
        GGML_ASSERT(tensor->ne[3] == 1);
        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
        size_t new_size;
        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
                  ggml_cann_type_mapping(tensor->type), &new_size));
        ACL_CHECK(aclDestroyIntArray(acl_shape));
        size = std::max(size, new_size);
    }

    return size;

@@ -1669,9 +1659,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_GET_ROWS:
            ggml_cann_get_rows(ctx, dst);
            break;
        case GGML_OP_SET_ROWS:
            ggml_cann_set_rows(ctx, dst);
            break;
        case GGML_OP_DUP:
            ggml_cann_dup(ctx, dst);
            break;
@@ -1694,18 +1681,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                    GGML_CANN_CALL_OP_UNARY(Abs);
                    GGML_CANN_CALL_UNARY_OP(Abs);
                    break;
                case GGML_UNARY_OP_NEG:
                    GGML_CANN_CALL_OP_UNARY(Neg);
                    GGML_CANN_CALL_UNARY_OP(Neg);
                    break;
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_ERF:
                    // aclnnGelu internally uses the erf-based approximation.
                    GGML_CANN_CALL_OP_UNARY(Gelu);
                    GGML_CANN_CALL_UNARY_OP(Gelu);
                    break;
                case GGML_UNARY_OP_SILU:
                    GGML_CANN_CALL_OP_UNARY(Silu);
                    GGML_CANN_CALL_UNARY_OP(Silu);
                    break;
                case GGML_UNARY_OP_GELU_QUICK: {
                    auto lambda = [](ggml_backend_cann_context& ctx,
@@ -1713,31 +1698,31 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                                     aclTensor* acl_dst) {
                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
                    };
                    ggml_cann_op_unary(lambda, ctx, dst);
                    ggml_cann_unary_op(lambda, ctx, dst);
                } break;
                case GGML_UNARY_OP_TANH:
                    GGML_CANN_CALL_OP_UNARY(Tanh);
                    GGML_CANN_CALL_UNARY_OP(Tanh);
                    break;
                case GGML_UNARY_OP_RELU:
                    GGML_CANN_CALL_OP_UNARY(Relu);
                    GGML_CANN_CALL_UNARY_OP(Relu);
                    break;
                case GGML_UNARY_OP_SIGMOID:
                    GGML_CANN_CALL_OP_UNARY(Sigmoid);
                    GGML_CANN_CALL_UNARY_OP(Sigmoid);
                    break;
                case GGML_UNARY_OP_HARDSIGMOID:
                    GGML_CANN_CALL_OP_UNARY(Hardsigmoid);
                    GGML_CANN_CALL_UNARY_OP(Hardsigmoid);
                    break;
                case GGML_UNARY_OP_HARDSWISH:
                    GGML_CANN_CALL_OP_UNARY(Hardswish);
                    GGML_CANN_CALL_UNARY_OP(Hardswish);
                    break;
                case GGML_UNARY_OP_EXP:
                    GGML_CANN_CALL_OP_UNARY(Exp);
                    GGML_CANN_CALL_UNARY_OP(Exp);
                    break;
                case GGML_UNARY_OP_ELU:
                    ggml_cann_elu(ctx, dst);
                    break;
                case GGML_UNARY_OP_SGN:
                    GGML_CANN_CALL_OP_UNARY(Sign);
                    GGML_CANN_CALL_UNARY_OP(Sign);
                    break;
                case GGML_UNARY_OP_STEP:
                    ggml_cann_step(ctx, dst);
@@ -1746,31 +1731,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
                    return false;
            }
            break;
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(dst)) {
                case GGML_GLU_OP_REGLU:
                    GGML_CANN_CALL_OP_UNARY_GATED(Relu);
                    break;
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_GEGLU_ERF:
                    // aclnnGelu internally uses the erf-based approximation.
                    GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
                    break;
                case GGML_GLU_OP_SWIGLU:
                    GGML_CANN_CALL_OP_UNARY_GATED(Silu);
                    break;
                case GGML_GLU_OP_GEGLU_QUICK: {
                    auto lambda = [](ggml_backend_cann_context& ctx,
                                     aclTensor* acl_src,
                                     aclTensor* acl_dst) {
                        GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
                    };
                    ggml_cann_op_unary_gated(lambda, ctx, dst);
                } break;
                default:
                    return false;
            }
            break;
        case GGML_OP_NORM:
            ggml_cann_norm(ctx, dst);
            break;
@@ -1813,7 +1773,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_binary_op<aclnn_mul>(ctx, dst);
            break;
        case GGML_OP_SQRT:
            GGML_CANN_CALL_OP_UNARY(Sqrt);
            GGML_CANN_CALL_UNARY_OP(Sqrt);
            break;
        case GGML_OP_CLAMP:
            ggml_cann_clamp(ctx, dst);
@@ -1858,16 +1818,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
            ggml_cann_argmax(ctx, dst);
            break;
        case GGML_OP_COS:
            ggml_cann_op_unary<aclnn_cos>(ctx, dst);
            ggml_cann_unary_op<aclnn_cos>(ctx, dst);
            break;
        case GGML_OP_SIN:
            ggml_cann_op_unary<aclnn_sin>(ctx, dst);
            ggml_cann_unary_op<aclnn_sin>(ctx, dst);
            break;
        case GGML_OP_CONV_TRANSPOSE_1D:
            ggml_cann_conv_transpose_1d(ctx, dst);
            break;
        case GGML_OP_LOG:
            GGML_CANN_CALL_OP_UNARY(Log);
            GGML_CANN_CALL_UNARY_OP(Log);
            break;
        case GGML_OP_MEAN:
            ggml_cann_mean(ctx, dst);

@@ -2016,9 +1976,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
        (ggml_backend_cann_context*)backend_dst->context;

    size_t copy_size = ggml_nbytes(dst);
    if (copy_size == 0) {
        return true;
    }
    if (backend_src != backend_dst) {
        ggml_backend_cann_buffer_context* buf_ctx_src =
            (ggml_backend_cann_buffer_context*)buf_src->context;
@@ -2075,160 +2032,6 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
    ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
}

#ifdef USE_ACL_GRAPH
/**
 * @brief Populate the internal CANN graph node properties from the ggml computation graph.
 *
 * This function copies all node attributes (operation type, dimensions, strides, input sources,
 * and operation parameters) into the cached CANN graph structure for later reuse or comparison.
 *
 * @param cann_ctx The CANN backend context.
 * @param cgraph The ggml computational graph.
 */
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
    for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
        ggml_tensor * node = cgraph->nodes[node_idx];
        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
        cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;

        for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
            cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
            cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
        }
        for (int src = 0; src < GGML_MAX_SRC; src++) {
            cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
                node->src[src] ? node->src[src]->data : nullptr;
        }
        memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
    }
}

/**
 * @brief Check if a ggml tensor node matches a previously captured CANN graph node.
 *
 * This function compares all relevant fields (address, op type, shape, source inputs, op params)
 * to determine whether the current node matches a previously recorded version.
 *
 * @param node The current ggml tensor node.
 * @param graph_node_properties The stored properties of a CANN graph node.
 * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
 */
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
    if (node->data != graph_node_properties->node_address &&
        node->op != GGML_OP_VIEW) {
        return false;
    }
    if (node->op != graph_node_properties->node_op) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (node->ne[i] != graph_node_properties->ne[i]) {
            return false;
        }
        if (node->nb[i] != graph_node_properties->nb[i]) {
            return false;
        }
    }
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (node->src[i] &&
            node->src[i]->data != graph_node_properties->src_address[i] &&
            node->op != GGML_OP_VIEW
        ) {
            return false;
        }
    }
    if (node->op == GGML_OP_SCALE &&
        memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
        return false;
    }
    return true;
}

/**
 * @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
 *
 * This checks whether the number or properties of ggml graph nodes have changed
 * compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
 *
 * @param cann_ctx The CANN backend context.
 * @param cgraph The current ggml computation graph.
 * @return true if an update is required; false otherwise.
 */
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
    // The number of nodes is different, so the graph needs to be reconstructed.
    if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
        cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
        return true;
    }

    // The number of nodes is the same; iterate over each node to check whether they match.
    for (int i = 0; i < cgraph->n_nodes; i++) {
        bool has_matching_properties = ggml_graph_node_has_matching_properties(
            cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
        if (!has_matching_properties) {
            return true;
        }
    }
    return false;
}
#endif // USE_ACL_GRAPH

/**
 * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
 *
 * If CANN graph execution is enabled and graph capture is required, this function begins
 * graph capture, runs the graph, ends capture, and stores the captured graph.
 *
 * Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
 *
 * @param cann_ctx The CANN backend context.
 * @param cgraph The ggml computation graph.
 * @param use_cann_graph Whether to use CANN graph execution.
 * @param cann_graph_update_required Whether graph capture is needed due to graph changes.
 */
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
                                            bool & use_cann_graph, bool & cann_graph_update_required) {
#ifdef USE_ACL_GRAPH
    if (use_cann_graph && cann_graph_update_required) {
        if (cann_ctx->cann_graph->graph != nullptr) {
            ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
            cann_ctx->cann_graph->graph = nullptr;
        }
        ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
    }
#endif // USE_ACL_GRAPH

    // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
    // With the use of CANN graphs, the execution will be performed by the graph launch.
    if (!use_cann_graph || cann_graph_update_required) {
        for (int i = 0; i < cgraph->n_nodes; i++) {
            ggml_tensor * node = cgraph->nodes[i];

            if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
                continue;
            }

            bool ok = ggml_cann_compute_forward(*cann_ctx, node);
            if (!ok) {
                GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
            }
            GGML_ASSERT(ok);
        }
    }

#ifdef USE_ACL_GRAPH
    if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
        ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
    }

    if (use_cann_graph) {
        // Execute graph
        ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
    }
#endif // USE_ACL_GRAPH
}
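
// ---- Illustrative sketch (not part of the diff) ----------------------------
// The capture/replay pattern the block above implements, reduced to its
// control flow. Names are stand-ins; the real calls are the aclmdlRICapture*
// and aclmdlRIExecuteAsync APIs used above.
#include <functional>

struct graph_cache_stub {
    bool captured = false;
    void begin_capture() { /* aclmdlRICaptureBegin */ }
    void end_capture()   { captured = true; /* aclmdlRICaptureEnd */ }
    void launch()        { /* aclmdlRIExecuteAsync */ }
};

static void run_graph_sketch(graph_cache_stub& g, bool topology_changed,
                             const std::function<void()>& eager_run) {
    if (!g.captured || topology_changed) {
        g.begin_capture();  // ops issued during eager_run are recorded
        eager_run();
        g.end_capture();
    }
    g.launch();             // replay the captured graph on the stream
}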


/**
 * @brief Computes a computational graph using a CANN backend.
 *
@@ -2245,37 +2048,24 @@ static enum ggml_status ggml_backend_cann_graph_compute(
    ggml_backend_t backend, ggml_cgraph* cgraph) {
    ggml_backend_cann_context* cann_ctx =
        (ggml_backend_cann_context*)backend->context;

    ggml_cann_set_device(cann_ctx->device);
    release_nz_workspace();
#ifdef USE_ACL_GRAPH
    bool use_cann_graph = true;
    bool cann_graph_update_required = false;

    // check environment LLAMA_SET_ROWS
    if (!cann_ctx->support_set_rows) {
        use_cann_graph = false;
    }

    if (use_cann_graph) {
        if (cann_ctx->cann_graph == nullptr) {
            cann_ctx->cann_graph.reset(new ggml_cann_graph());
            cann_graph_update_required = true;
        }

        cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
        set_ggml_graph_node_properties(cann_ctx, cgraph);
    }
#else
    bool use_cann_graph = false;
    bool cann_graph_update_required = false;
#endif // USE_ACL_GRAPH

    evaluate_and_capture_cann_graph(
        cann_ctx,
        cgraph,
        use_cann_graph,
        cann_graph_update_required
    );

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor* node = cgraph->nodes[i];

        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
            continue;
        }

        bool ok = ggml_cann_compute_forward(*cann_ctx, node);

        if (!ok) {
            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
                           node->name, ggml_op_name(node->op));
        }
        GGML_ASSERT(ok);
    }

    return GGML_STATUS_SUCCESS;
}

@@ -2311,23 +2101,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            case GGML_UNARY_OP_ELU:
            case GGML_UNARY_OP_SGN:
            case GGML_UNARY_OP_STEP:
            case GGML_UNARY_OP_GELU_ERF:
                return true;
            default:
                return false;
        }
        case GGML_OP_GLU:
            switch (ggml_get_glu_op(op)) {
                case GGML_GLU_OP_REGLU:
                case GGML_GLU_OP_GEGLU:
                case GGML_GLU_OP_SWIGLU:
                case GGML_GLU_OP_GEGLU_ERF:
                case GGML_GLU_OP_GEGLU_QUICK:
                    return true;
                default:
                    return false;
            }
            break;
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
                case GGML_TYPE_F16:
@@ -2374,15 +2151,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                    return false;
            }
        } break;
        case GGML_OP_SET_ROWS: {
            switch (op->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                    return true;
                default:
                    return false;
            }
        } break;
        case GGML_OP_SET_ROWS:
            {
                // TODO: add support
                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
                return false;
            } break;
        case GGML_OP_CPY: {
            ggml_tensor *src = op->src[0];
            if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2391,6 +2166,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // only F32 and F16 are supported.
                return false;
            }

            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
                // a non-contiguous dst is not supported.
                return false;
            }

            return true;
        } break;
        case GGML_OP_CONT: {
@@ -2456,8 +2237,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            // value of paddingW should be at most half of kernelW
            return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
        }
        case GGML_OP_DUP:
        case GGML_OP_SUM:
        case GGML_OP_DUP:
        case GGML_OP_IM2COL:
        case GGML_OP_CONCAT:
        case GGML_OP_REPEAT:
@@ -2499,11 +2280,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
            return bias == 0.0f;  // TODO: support bias != 0.0f
        case GGML_OP_SOFT_MAX:
            // TODO: support attention sinks [TAG_ATTN_SINKS]
            if (op->src[2]) {
                return false;
            }
            return true;
            // TODO: support broadcast
            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
        case GGML_OP_FLASH_ATTN_EXT: {
            // derived from [ggml-cuda.cu]
            if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
@@ -2515,10 +2294,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
                return false;
            }
            // TODO: support attention sinks [TAG_ATTN_SINKS]
            if (op->src[4]) {
                return false;
            }
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
                // different head sizes of K and V are not supported yet
                return false;
@@ -2530,6 +2305,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // DeepSeek MLA
                return false;
            }
            // TODO: support broadcast
            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
            if (op->src[0]->ne[3] != 1) {
                return false;
            }
            float logitSoftcap = 0.0f;
            memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
            if (logitSoftcap != 0.0f) {
|
||||
|
||||
@@ -99,9 +99,6 @@ typedef sycl::half2 ggml_half2;
#define QI4_1 (QK4_1 / (4 * QR4_1))
#define QR4_1 2

#define QI_MXFP4 (QK_MXFP4 / (4 * QR_MXFP4))
#define QR_MXFP4 2

#define QI5_0 (QK5_0 / (4 * QR5_0))
#define QR5_0 2

@@ -187,13 +184,6 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");

#define QK_MXFP4 32
typedef struct {
    uint8_t e; // E8M0
    uint8_t qs[QK_MXFP4/2];
} block_mxfp4;
static_assert(sizeof(block_mxfp4) == sizeof(uint8_t) + QK_MXFP4/2, "wrong mxfp4 block size/padding");

#define QK5_0 32
typedef struct {
    ggml_half d;           // delta
@@ -1084,17 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
GGML_TABLE_END()

// TODO: fix name to kvalues_iq4_nl
GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
GGML_TABLE_END()

// e2m1 values (doubled)
// ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
GGML_TABLE_BEGIN(int8_t, kvalues_mxfp4, 16)
    0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12,
GGML_TABLE_END()

#define NGRID_IQ1S 2048
#define IQ1S_DELTA 0.125f
#define IQ1M_DELTA 0.125f
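Note: the block_mxfp4 layout removed above stores one shared E8M0 exponent plus 16 bytes of packed 4-bit indices into the kvalues_mxfp4 table. The following is a minimal standalone sketch of how such a block decodes to 32 floats; it is not the ggml implementation (the helper name dequant_mxfp4_block is hypothetical, and ldexpf stands in for ggml's GGML_E8M0_TO_FP32_HALF macro):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Doubled e2m1 values, mirroring the kvalues_mxfp4 table above.
static const int8_t kvalues[16] = { 0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12 };

// Hypothetical helper: dequantize one 32-element mxfp4 block. e is the shared
// E8M0 exponent; qs packs two 4-bit indices per byte (low nibbles first).
static void dequant_mxfp4_block(uint8_t e, const uint8_t qs[16], float out[32]) {
    // E8M0 is a pure power of two with bias 127; the extra -1 compensates
    // for the table storing doubled e2m1 values.
    const float d = ldexpf(1.0f, (int) e - 127 - 1);
    for (int j = 0; j < 16; ++j) {
        out[j]      = d * kvalues[qs[j] & 0x0f];
        out[j + 16] = d * kvalues[qs[j] >> 4];
    }
}

int main(void) {
    uint8_t qs[16] = {0};
    float out[32];
    qs[0] = 0x17;                            // low nibble 7 (-> 12), high nibble 1 (-> 1)
    dequant_mxfp4_block(127 + 1, qs, out);   // exponent = bias + 1, i.e. scale 1.0
    printf("%.1f %.1f\n", out[0], out[16]);  // prints 12.0 1.0
    return 0;
}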
@@ -458,9 +458,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        list(APPEND ARCH_FLAGS -march=z16)
    elseif (${S390X_M} MATCHES "9175|9176")
        # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
        # binutils must also be updated to the latest for the -march=z17 flag to work. Otherwise, use -march=arch15.
        message(STATUS "z17 target")
        list(APPEND ARCH_FLAGS -march=arch15)
        list(APPEND ARCH_FLAGS -march=z17)
    else()
        message(STATUS "Unknown target")
        message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
@@ -13,7 +13,6 @@
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
@@ -38,25 +37,17 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
@@ -73,7 +64,6 @@
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -82,23 +72,18 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -107,16 +92,12 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -131,7 +112,6 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -139,15 +119,11 @@
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
@@ -163,7 +139,6 @@
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -172,16 +147,12 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
@@ -196,7 +167,6 @@
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -205,14 +175,10 @@
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#endif

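Note: these arch-fallback definitions alias the generic (scalar) symbol names to the public kernel names on targets without a hand-tuned implementation. The following is a minimal standalone sketch of that aliasing pattern; my_dot and my_dot_generic are hypothetical names, not ggml symbols:

#include <stdio.h>

/* On a target with no hand-written kernel, the generic symbol name is
 * remapped to the public one, so the scalar definition below is compiled
 * directly as my_dot(). */
#ifndef HAVE_FAST_DOT
#define my_dot_generic my_dot
#endif

void my_dot_generic(const float * x, const float * y, int n, float * s) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) sum += x[i] * y[i];  // plain scalar loop
    *s = sum;
}

int main(void) {
    float a[2] = {1.0f, 2.0f}, b[2] = {3.0f, 4.0f}, s;
    my_dot(a, b, 2, &s);   // resolves to the generic scalar version here
    printf("%.1f\n", s);   // prints 11.0
    return 0;
}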
@@ -589,67 +589,6 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;
}

void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK_MXFP4;

    int ib = 0;
    float sumf = 0;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    uint8x16x2_t q4bits;
    int8x16x4_t q4b;
    int8x16x4_t q8b;
    int32x4_t prod_1;
    int32x4_t prod_2;

    for (; ib + 1 < nb; ib += 2) {
        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
        q8b.val[0] = vld1q_s8(y[ib + 0].qs);
        q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
        q8b.val[2] = vld1q_s8(y[ib + 1].qs);
        q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);

        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

        sumf +=
            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
    }

#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
        int sumi1 = 0;
        int sumi2 = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            sumi1 += y[ib].qs[j + 0]          * kvalues_mxfp4[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -1297,10 +1236,44 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};

    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int sum = 0;

        for (size_t j = 0; j < sizeof(x->qs) - sizeof(x->qs) % 32; j += 32) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 32; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*32 + m];
                }
            }
        }
        for (size_t j = sizeof(x->qs) - sizeof(x->qs) % 32; j < sizeof(x->qs); j += 16) {
            for (size_t l = 0; l < 5; ++l) {
                for (size_t m = 0; m < 16; ++m) {
                    uint8_t q = x[i].qs[j + m] * pow3[l];
                    uint16_t xi = ((uint16_t) q * 3) >> 8;
                    sum += (xi - 1) * y[i].qs[j*5 + l*16 + m];
                }
            }
        }

        for (size_t l = 0; l < 4; ++l) {
            for (size_t j = 0; j < sizeof(x->qh); ++j) {
                uint8_t q = x[i].qh[j] * pow3[l];
                uint16_t xi = ((uint16_t) q * 3) >> 8;
                sum += (xi - 1) * y[i].qs[sizeof(x->qs)*5 + l*sizeof(x->qh) + j];
            }
        }

        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
    }

    *s = sumf;
#endif
}

@@ -1408,10 +1381,25 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0.0f;

    for (int i = 0; i < nb; ++i) {
        int32_t sumi = 0;

        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
            for (size_t l = 0; l < 4; ++l) {
                for (size_t k = 0; k < 32; ++k) {
                    sumi += y[i].qs[j*4 + l*32 + k] * (((x[i].qs[j + k] >> (l*2)) & 3) - 1);
                }
            }
        }

        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

        sumf += (float) sumi * d;
    }

    *s = sumf;
#endif
}
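Note: the tq1_0 scalar path above extracts base-3 digits with a fixed-point trick: a byte q encodes five trits scaled by 256/243, so multiplying by pow3[l] (mod 256) shifts digit l to the top, and ((uint16_t)q * 3) >> 8 reads it out. A minimal standalone check of that identity (the packing here mirrors the idea, not ggml's quantizer verbatim):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint8_t pow3[6] = {1, 3, 9, 27, 81, 243};
    const int trits[5] = {2, 0, 1, 2, 1};            // digits to pack, each in {0,1,2}

    // Pack: v = t0*81 + t1*27 + t2*9 + t3*3 + t4, then scale by 256/243 (ceil).
    uint16_t v = 0;
    for (int l = 0; l < 5; ++l) v = v * 3 + trits[l];
    uint8_t q = (uint8_t) ((v * 256 + 242) / 243);

    // Unpack: q * 3^l wraps mod 256, moving digit l into the top of the
    // fixed-point fraction; * 3 >> 8 then recovers it exactly.
    for (int l = 0; l < 5; ++l) {
        uint8_t  shifted = (uint8_t) (q * pow3[l]);
        uint16_t digit   = ((uint16_t) shifted * 3) >> 8;
        printf("digit %d = %u (expected %d)\n", l, digit, trits[l]);
    }
    return 0;
}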
@@ -1741,10 +1729,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l =  0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}
@@ -2034,12 +2057,68 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}
@@ -2352,14 +2431,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
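Note: the utmp/kmask shuffle above unpacks the 12-byte K-quant scale block into eight 6-bit scales and eight 6-bit mins. The same layout can be read byte-wise; the sketch below packs and unpacks it for verification. get_scale_min_k4 mirrors the helper used elsewhere in ggml's k-quants, while pack_k4 and the test values are our own illustration:

#include <stdint.h>
#include <stdio.h>

// Pack 8 six-bit scales and 8 six-bit mins into 12 bytes (K-quant layout).
static void pack_k4(const uint8_t sc[8], const uint8_t mn[8], uint8_t q[12]) {
    for (int j = 0; j < 4; ++j) {
        q[j]     = sc[j] & 63;
        q[j + 4] = mn[j] & 63;
        q[j + 8] = (sc[j + 4] & 0xF) | ((mn[j + 4] & 0xF) << 4);
    }
    for (int j = 0; j < 4; ++j) {
        q[j]     |= (uint8_t)((sc[j + 4] >> 4) << 6);   // top 2 bits of scales 4..7
        q[j + 4] |= (uint8_t)((mn[j + 4] >> 4) << 6);   // top 2 bits of mins 4..7
    }
}

// Read back scale j and min j the way the scalar kernels do.
static void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
    if (j < 4) {
        *d = q[j] & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *m = (q[j + 4] >>  4) | ((q[j - 0] >> 6) << 4);
    }
}

int main(void) {
    const uint8_t sc[8] = {1, 12, 23, 34, 45, 56, 61, 7};
    const uint8_t mn[8] = {9, 8, 7, 6, 5, 4, 3, 2};
    uint8_t q[12] = {0};
    pack_k4(sc, mn, q);
    for (int j = 0; j < 8; ++j) {
        uint8_t d, m;
        get_scale_min_k4(j, q, &d, &m);
        printf("j=%d: scale %u (exp %u), min %u (exp %u)\n", j, d, sc[j], m, mn[j]);
    }
    return 0;
}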
@@ -2452,14 +2578,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -2915,10 +3093,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    }
    *s = sum;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t  aux8[QK_K];
    int16_t aux16[8];
    float   sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
@@ -3014,10 +3229,34 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.25f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32[2];
    const uint8_t * aux8 = (const uint8_t *)aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const  int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(aux32, q2, 2*sizeof(uint32_t));
            q2 += 4;
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
                const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}
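Note: in the iq2 scalar paths, each group of eight codebook values carries a compact sign pattern: ksigns_iq2xs maps a 7-bit index to an 8-bit mask (the eighth sign is derivable from parity), and kmask_iq2xs selects one bit per lane. A minimal standalone illustration of applying such a mask; the grid row and mask value below are made up:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    static const uint8_t kmask[8] = {1, 2, 4, 8, 16, 32, 64, 128};
    const uint8_t grid[8] = {25, 43, 11, 8, 19, 5, 43, 25};  // made-up grid row
    const uint8_t signs   = 0xA5;                            // bit j set -> negate value j
    for (int j = 0; j < 8; ++j) {
        int v = grid[j] * ((signs & kmask[j]) ? -1 : 1);
        printf("%d ", v);
    }
    printf("\n");
    return 0;
}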
@@ -3088,10 +3327,42 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
        const  int8_t  * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >>  4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -3184,10 +3455,45 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >>  4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

#endif

}

@@ -3247,10 +3553,36 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.5f * sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}

@@ -3357,10 +3689,48 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const  int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >>  4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}
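Note: iq3_s extends each 8-bit grid index with a ninth bit taken from qh: the even bits of qh (via << (8-2*l)) extend the first index of a pair, the odd bits (via << (7-2*l)) the second. A quick standalone check of that bit arithmetic; the qh value is arbitrary:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t qh = 0x0B;  // example: bit 2*l holds the 9th bit of index 2*l
    for (int l = 0; l < 4; ++l) {
        uint16_t hi = ((uint16_t) qh << (8 - 2 * l)) & 256;
        printf("l=%d: extra bit %d -> adds %u to the grid index\n",
               l, (qh >> (2 * l)) & 1, hi);
    }
    return 0;
}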
@@ -3423,10 +3793,36 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const  int8_t  * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi  += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}

@@ -3516,11 +3912,52 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(scale);
    ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int sum1[2], sum2[2], delta[4];

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const  int8_t  * q8 = y[i].qs;
        const uint8_t  * qs = x[i].qs;
        const uint8_t  * qh = x[i].qh;
        const uint16_t * sc = (const uint16_t *)x[i].scales;

        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            delta[0] = qh[0] & 0x08 ? -1 : 1;
            delta[1] = qh[0] & 0x80 ? -1 : 1;
            delta[2] = qh[1] & 0x08 ? -1 : 1;
            delta[3] = qh[1] & 0x80 ? -1 : 1;
            sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700)));
                int lsum1 = 0, lsum2 = 0;
                for (int j = 0; j < 8; ++j) {
                    lsum1 += q8[j] * grid[j];
                    lsum2 += q8[j];
                }
                q8 += 8;
                sum1[l/2] += lsum1;
                sum2[l/2] += lsum2*delta[l];
            }

            const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1;
            const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1;

            sumi1 += sum1[0] * ls1 + sum1[1] * ls2;
            sumi2 += sum2[0] * ls1 + sum2[1] * ls2;
            qs += 4;
            qh += 2;
        }

        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
    }

    *s = sumf;

#endif
}
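Note: both iq1 kernels fold the per-group ±delta shift through precomputed q8 sums instead of shifting every weight: sum_j q8[j]*(g[j] + delta) equals sum_j q8[j]*g[j] + delta * sum_j q8[j]. A tiny numeric check of that identity with made-up values:

#include <stdio.h>

int main(void) {
    const int   q8[8] = {3, -1, 4, 1, -5, 9, 2, -6};
    const int   g[8]  = {1, -1, 1, 1, -1, -1, 1, -1};  // stand-in grid values
    const float delta = 0.125f;                        // IQ1S_DELTA

    float direct = 0.0f; int lsum = 0, bsum = 0;
    for (int j = 0; j < 8; ++j) {
        direct += q8[j] * (g[j] + delta);
        lsum   += q8[j] * g[j];
        bsum   += q8[j];
    }
    printf("direct=%f  folded=%f\n", direct, lsum + delta * bsum);  // identical
    return 0;
}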
@@ -3641,10 +4078,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const  int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >>  4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}

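Note: iq4_nl/iq4_xs map each 4-bit index through a nonlinear codebook rather than scaling it directly. A minimal standalone lookup using the kvalues_iq4nl table shown earlier in ggml-common.h; the packed byte is an arbitrary example:

#include <stdint.h>
#include <stdio.h>

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

int main(void) {
    const uint8_t packed = 0x9B;  // low nibble 0xB, high nibble 0x9
    printf("low  -> %d\n", kvalues_iq4nl[packed & 0xf]);  // index 11 -> 38
    printf("high -> %d\n", kvalues_iq4nl[packed >> 4]);   // index  9 -> 13
    return 0;
}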
@@ -86,9 +86,35 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
        }
    }
#else
    UNUSED(nb);
    UNUSED(y);
    ggml_quantize_mat_q8_0_4x4_generic(x, vy, k);
    // scalar
    const int blck_size_interleave = 4;
    float srcv[4][QK8_0];
    float id[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max

            for (int j = 0; j < QK8_0; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                amax = MAX(amax, fabsf(srcv[row_iter][j]));
            }

            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);

            float x0 = srcv[src_id][src_offset] * id[src_id];
            y[i].qs[j] = roundf(x0);
        }
    }
#endif
}

@@ -179,9 +205,35 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
    }

#else
    UNUSED(nb);
    UNUSED(y);
    ggml_quantize_mat_q8_0_4x8_generic(x, vy, k);
    // scalar
    const int blck_size_interleave = 8;
    float srcv[4][QK8_0];
    float id[4];

    for (int i = 0; i < nb; i++) {
        for (int row_iter = 0; row_iter < 4; row_iter++) {
            float amax = 0.0f; // absolute max

            for (int j = 0; j < QK8_0; j++) {
                srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
                amax = MAX(amax, fabsf(srcv[row_iter][j]));
            }

            const float d = amax / ((1 << 7) - 1);
            id[row_iter] = d ? 1.0f / d : 0.0f;

            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
        }

        for (int j = 0; j < QK8_0 * 4; j++) {
            int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
            int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
            src_offset += (j % blck_size_interleave);

            float x0 = srcv[src_id][src_offset] * id[src_id];
            y[i].qs[j] = roundf(x0);
        }
    }
#endif
}
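Note: the src_offset/src_id arithmetic above interleaves four source rows in chunks of blck_size_interleave values. A standalone trace of that mapping for the 4x4 variant, printing where the first few destination slots come from:

#include <stdio.h>

int main(void) {
    const int B  = 4;   // blck_size_interleave for the 4x4 variant
    const int QK = 32;  // QK8_0
    for (int j = 0; j < 4 * QK; ++j) {
        int src_offset = (j / (4 * B)) * B + (j % B);
        int src_id     = (j % (4 * B)) / B;
        if (j < 16) printf("dst %2d <- row %d, elem %2d\n", j, src_id, src_offset);
        // rows 0..3 contribute B consecutive elements each, round-robin
    }
    return 0;
}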
@@ -243,7 +295,29 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    ggml_gemv_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -309,7 +383,29 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
    ggml_gemv_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -401,7 +497,31 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif // #if defined(__ARM_FEATURE_SVE)

#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[8];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -471,7 +591,31 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
    ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[4];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
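Note: the q4_0 scalar paths above sign-extend packed nibbles without a lookup: (int8_t)(b << 4) moves the low nibble into the sign position (the repacked blocks appear to store nibbles in two's-complement form), and the single >> 4 after the multiply rescales both nibble products at once. A quick standalone trace of that trick:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    for (int nib = 0; nib < 16; ++nib) {
        int v0 = (int8_t) (nib << 4);  // low nibble, scaled by 16 and sign-extended
        printf("nibble %2d -> %4d (=%3d after >>4)\n", nib, v0, v0 >> 4);
    }
    return 0;  // nibbles 8..15 come out as -8..-1, i.e. two's complement
}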
@@ -952,7 +1096,40 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
);
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
|
||||
ggml_gemm_q4_0_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
{
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
||||
@@ -1373,7 +1550,38 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
);
|
||||
return;
|
||||
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
ggml_gemm_q4_0_4x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
|
||||
float sumf[4][4];
|
||||
int sumi;
|
||||
|
||||
for (int y = 0; y < nr / 4; y++) {
|
||||
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
||||
for (int x = 0; x < nc / ncols_interleaved; x++) {
|
||||
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
||||
}
|
||||
for (int l = 0; l < nb; l++) {
|
||||
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++) {
|
||||
sumi = 0;
|
||||
for (int i = 0; i < blocklen; ++i) {
|
||||
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
||||
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
||||
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
||||
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
||||
}
|
||||
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int m = 0; m < 4; m++) {
|
||||
for (int j = 0; j < ncols_interleaved; j++)
|
||||
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}

void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1811,7 +2019,38 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
#endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)

#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1887,5 +2126,38 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
    }
    return;
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
    ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[4][4];
        int sumi;

        for (int y = 0; y < nr / 4; y++) {
            const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
            for (int x = 0; x < nc / ncols_interleaved; x++) {
                const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
                for (int m = 0; m < 4; m++) {
                    for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
                }
                for (int l = 0; l < nb; l++) {
                    for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                        for (int m = 0; m < 4; m++) {
                            for (int j = 0; j < ncols_interleaved; j++) {
                                sumi = 0;
                                for (int i = 0; i < blocklen; ++i) {
                                    const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                                    const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                                    sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                             (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
                                }
                                sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                            }
                        }
                    }
                }
                for (int m = 0; m < 4; m++) {
                    for (int j = 0; j < ncols_interleaved; j++)
                        s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
                }
            }
        }
    }
}
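// IQ4_NL stores the same two 4-bit indices per byte, but maps them through a
// non-linear 16-entry codebook instead of the linear (q - 8) grid, so the
// kernel above looks values up and needs no final >> 4. A sketch of the
// per-byte decode (table values copied from ggml-common.h for illustration):
static const int8_t example_kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};
static inline void example_iq4nl_decode(uint8_t packed, int8_t * lo, int8_t * hi) {
    *lo = example_kvalues_iq4nl[packed & 0x0F]; // low nibble indexes the codebook
    *hi = example_kvalues_iq4nl[packed >> 4];   // high nibble likewise
}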

@@ -821,15 +821,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc) + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}
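// Why the q4_1 tail loop adds the extra m*s term: with x_j = d_x*q_j + m_x and
// y_j = d_y*p_j, the block dot product decomposes as
//   sum_j x_j*y_j = d_x*d_y * sum_j q_j*p_j  +  m_x * (d_y * sum_j p_j)
// and q8_1 blocks precompute s = d_y * sum_j p_j at quantization time. A
// sketch with plain floats standing in for the fp16 fields:
static inline float example_q4_1_block_dot(float dx, float mx, int sumi,
                                           float dy, float sy) {
    return (dx * dy) * sumi + mx * sy; // sumi = sum of q*p, sy = dy * sum of p
}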

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -874,15 +883,30 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}
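// Q5_0 keeps the fifth bit of all 32 weights in a separate 32-bit field qh:
// bit j holds the high bit of the j-th low nibble and bit j+16 that of the
// j-th high nibble. The masks above splice it back in before the -16 offset.
// A sketch for one index j, mirroring the loop body:
static inline void example_q5_0_unpack(uint8_t byte, uint32_t qh, int j,
                                       int32_t * x0, int32_t * x1) {
    const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; // bit j    -> bit 4
    const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));      // bit j+16 -> bit 4
    *x0 = (int8_t)(((byte & 0x0F) | xh_0) - 16); // 5-bit value, centered on 0
    *x1 = (int8_t)(((byte >> 4) | xh_1) - 16);
}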

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -930,15 +954,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc) + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -977,15 +1016,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = hsum_float_8(acc);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}
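// The q8_0 path has no nibbles to unpack: each block is 32 signed bytes plus
// one fp16 scale, so the whole block dot is an integer accumulation followed
// by a single float multiply. A worked sketch on a toy 4-element "block":
//   x.qs = { 1, -2, 3, 4 }, x.d = 0.5;  y.qs = { 2, 2, -1, 1 }, y.d = 0.25
//   sumi = 1*2 + (-2)*2 + 3*(-1) + 4*1 = -1
//   contribution = sumi * x.d * y.d = -1 * 0.125 = -0.125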

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1061,10 +1103,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}
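// Each q2_K scale byte packs two 4-bit fields: the low nibble scales the
// quantized weights, the high nibble scales the per-block minimum, and the
// precomputed y[i].bsums lets the min contribution be folded in per 16-value
// group without touching the weights again. A sketch of the split:
static inline void example_q2_K_scale_split(uint8_t sc, int * scale, int * min) {
    *scale = sc & 0xF; // multiplies the 2-bit quants (isuml above)
    *min   = sc >> 4;  // multiplies the bsums term (summs above)
}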

@@ -1162,13 +1239,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc);

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1257,14 +1391,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}
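// The utmp[] shuffle above unpacks x[i].scales -- 12 bytes holding eight 6-bit
// scales and eight 6-bit mins -- into 16 plain bytes. A sketch of the same
// bit-twiddling as a standalone helper, with the mask constants these kernels
// define elsewhere (kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f,
// kmask3 = 0x03030303 in ggml):
static inline void example_q4_K_unpack_scales(const uint8_t * src, uint32_t utmp[4]) {
    const uint32_t kmask1 = 0x3f3f3f3f, kmask2 = 0x0f0f0f0f, kmask3 = 0x03030303;
    uint32_t tmp[3];
    memcpy(tmp, src, 12); // avoid unaligned uint32_t loads
    utmp[3] = ((tmp[2] >> 4) & kmask2) | (((tmp[1] >> 6) & kmask3) << 4);
    utmp[1] = (tmp[2] & kmask2) | (((tmp[0] >> 6) & kmask3) << 4);
    utmp[2] = tmp[1] & kmask1;
    utmp[0] = tmp[0] & kmask1;
    // bytes 0..7 of utmp are now the scales, bytes 8..15 the mins
}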

@@ -1360,14 +1541,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1445,10 +1678,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = hsum_float_8(acc);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1545,10 +1815,34 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.125f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32[2];
    const uint8_t * aux8 = (const uint8_t *)aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(aux32, q2, 2*sizeof(uint32_t));
            q2 += 4;
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}
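// The iq2_xxs decode above rebuilds 8 weights at a time: a byte indexes a
// precomputed grid of 8 magnitudes, and 7 packed sign bits expand through
// ksigns_iq2xs (the 8th sign is the parity of the other 7). A sketch of the
// sign-application step, with kmask_iq2xs[j] assumed to be the bit masks
// { 1, 2, 4, 8, 16, 32, 64, 128 } as in ggml-common.h:
static inline int example_iq2_signed_dot(const uint8_t * grid,
                                         const int8_t * q8, uint8_t signs) {
    int sumi = 0;
    for (int j = 0; j < 8; ++j) {
        // a set bit means the grid magnitude enters with negative sign
        sumi += grid[j] * q8[j] * ((signs & (uint8_t)(1u << j)) ? -1 : 1);
    }
    return sumi;
}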

@@ -1684,10 +1978,42 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t * GGML_RESTRICT sc = x[i].scales;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -1779,11 +2105,47 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

#endif

}

void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1847,10 +2209,36 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.25f * hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}

@@ -1950,10 +2338,48 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = hsum_float_8(accumf);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}

@@ -2034,10 +2460,36 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}

@@ -2151,10 +2603,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = hsum_float_8(accum);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}
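// Each iq4_xs block scale is 6 bits: the low 4 come from a nibble of
// scales_l and the top 2 from the rolling scales_h word, giving a value in
// [0, 63] that the -32 above re-centers. A sketch for one pair of 32-value
// sub-blocks, mirroring the loop (before h is shifted down by 4):
static inline void example_iq4_xs_scales(uint8_t scales_l, uint16_t h,
                                         int * ls1, int * ls2) {
    *ls1 = ((scales_l & 0xf) | ((h << 4) & 0x30)) - 32; // bits 0..1 of h on top
    *ls2 = ((scales_l >> 4) | ((h << 2) & 0x30)) - 32;  // bits 2..3 of h on top
}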


@@ -201,14 +201,24 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}
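// The idiom shared by every function in this file: ib counts the blocks the
// SIMD branch consumed, so the plain-C loop `for (; ib < nb; ++ib)` doubles
// as both the full fallback (ib still 0) and the remainder handler. A sketch
// of the accumulation it performs for one q4_0 x q8_0 block, factored into a
// helper (hypothetical name, for illustration only):
static inline float example_q4_0_block_dot(const block_q4_0 * x,
                                           const block_q8_0 * y) {
    int sumi0 = 0, sumi1 = 0;
    for (int j = 0; j < QK4_0/2; ++j) {
        const int v0 = (x->qs[j] & 0x0F) - 8; // low nibble, re-centered
        const int v1 = (x->qs[j] >> 4) - 8;   // high nibble, re-centered
        sumi0 += v0 * y->qs[j];
        sumi1 += v1 * y->qs[j + QK4_0/2];
    }
    return (sumi0 + sumi1) * GGML_CPU_FP16_TO_FP32(x->d) * GGML_CPU_FP16_TO_FP32(y->d);
}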

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -268,14 +278,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -340,14 +360,30 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -415,15 +451,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -484,15 +535,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -641,10 +695,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}

@@ -818,13 +907,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -984,14 +1130,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1149,14 +1342,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1311,10 +1556,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1455,10 +1737,34 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32[2];
    const uint8_t * aux8 = (const uint8_t *)aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(aux32, q2, 2*sizeof(uint32_t));
            q2 += 4;
            const uint32_t ls = 2*(aux32[1] >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
                const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -1563,10 +1869,42 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t * GGML_RESTRICT sc = x[i].scales;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}

@@ -1692,11 +2030,47 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
        const uint8_t * signs = qs + QK_K/8;

        int bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
            int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
            int sumi1 = 0, sumi2 = 0;
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
                for (int j = 0; j < 8; ++j) {
                    sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += ls1 * sumi1 + ls2 * sumi2;
            qs += 4;
            signs += 4;
        }

        sumf += d * bsum;
    }

    *s = 0.125f * sumf;

#endif

}

void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1798,10 +2172,36 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
    *s = 0.25f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    uint32_t aux32;

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
            const uint32_t ls = 2*(aux32 >> 28) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
                const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            q3 += 8;
            bsum += sumi * ls;
        }
        sumf += d * bsum;
    }
    *s = 0.25f * sumf;
#endif
}

@@ -1927,10 +2327,48 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
            const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
            int32_t sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls1;
            sumi = 0;
            for (int l = 0; l < 4; ++l) {
                const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
                const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
                for (int j = 0; j < 4; ++j) {
                    sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
                    sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
                }
                q8 += 8;
            }
            qs += 8;
            signs += 4;
            bsum += sumi * ls2;
        }
        sumf += d * bsum;
    }
    *s = sumf;
#endif
}

@@ -2043,10 +2481,36 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;
    for (int i = 0; i < nb; i++) {

        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint16_t * qh = x[i].qh;

        int sumi = 0, sumi1 = 0;
        for (int ib = 0; ib < QK_K/32; ++ib) {
            const int ls = 2*((qh[ib] >> 12) & 7) + 1;
            const int delta = qh[ib] & 0x8000 ? -1 : 1;
            int lsum = 0;
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
                for (int j = 0; j < 8; ++j) {
                    lsum += q8[j] * grid[j];
                }
                q8 += 8;
            }
            sumi += ls * lsum;
            sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
            qs += 4;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }

    *s = sumf;

#endif
}

@@ -2117,15 +2581,17 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}
|
||||
|
||||
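iq4_nl stores plain 4-bit indices into a fixed non-linear codebook (kvalues_iq4nl above), so the scalar path is a table lookup per nibble rather than an affine decode. A minimal sketch; the codebook values below are quoted from memory of ggml's table and should be checked against the source, and the helper name is illustrative:

#include <stdint.h>

// Stand-in for ggml's kvalues_iq4nl: 16 int8 levels, denser near zero.
static const int8_t kvalues_demo[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
};

// Decode the two codebook values packed into one iq4_nl byte.
static void iq4nl_decode_byte(uint8_t q, int8_t * lo, int8_t * hi) {
    *lo = kvalues_demo[q & 0xF];  // low nibble indexes the table
    *hi = kvalues_demo[q >> 4];   // high nibble indexes the table
}
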
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -2230,10 +2696,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}


@@ -116,7 +116,6 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
//===================================== Dot products =================================

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_0;
    const int nb = n / qk;

@@ -133,6 +132,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    int ib = 0;
    float sumf = 0;

#if defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {
@@ -164,14 +164,27 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
#else
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}

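The scalar tail loop above decodes two 4-bit values per byte: the low nibble maps to the first half of the block, the high nibble to the second half, and both are re-centered by subtracting 8 before the fp16 scale is applied. A standalone sketch of that decode (hypothetical helper; QK matches the QK8_0 block size used above):

#include <stdint.h>

#define QK 32  // block size, matching QK8_0 above

// Expand one q4_0 block (QK/2 packed bytes plus scale d) into QK floats:
// low nibbles fill out[0..QK/2-1], high nibbles fill out[QK/2..QK-1].
static void dequantize_q4_0_block(const uint8_t qs[QK/2], float d, float out[QK]) {
    for (int j = 0; j < QK/2; ++j) {
        out[j]        = ((int)(qs[j] & 0x0F) - 8) * d;
        out[j + QK/2] = ((int)(qs[j] >> 4)   - 8) * d;
    }
}
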
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_1;
    const int nb = n / qk;

@@ -188,6 +201,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    int ib = 0;
    float sumf = 0;

#if defined(__riscv_v)
    size_t vl = qk / 2;

    for (; ib < nb; ++ib) {
@@ -215,14 +229,27 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
#else
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_0;
    const int nb = n / qk;

@@ -240,6 +267,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    const block_q5_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__riscv_v)
    size_t vl;
    size_t vlenb = __riscv_vlenb();

@@ -269,14 +297,33 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
#else
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

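The q5_0 tail loop above rebuilds the fifth bit of each weight from the packed 32-bit qh word: bit j supplies the high bit for the low-nibble value at position j, and bit j+16 for the high-nibble value at j + qk/2. The same bit surgery in isolation (illustrative helper, arithmetically equivalent to the shifts above):

#include <stdint.h>

// Recover the two signed 5-bit values that byte qs_j contributes,
// given the packed high-bit word qh and the position j in the block.
static void q5_0_values(uint8_t qs_j, uint32_t qh, int j, int * v_lo, int * v_hi) {
    const uint8_t xh_0 = ((qh >> (j + 0))  & 1) << 4;  // high bit of the low-nibble value
    const uint8_t xh_1 = ((qh >> (j + 16)) & 1) << 4;  // high bit of the high-nibble value
    *v_lo = (int8_t)(((qs_j & 0x0F) | xh_0) - 16);     // re-centered to [-16, 15]
    *v_hi = (int8_t)(((qs_j >> 4)   | xh_1) - 16);
}
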
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
    const int qk = QK8_1;
    const int nb = n / qk;

@@ -294,6 +341,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__riscv_v)
    size_t vl;
    size_t vlenb = __riscv_vlenb();

@@ -322,10 +370,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
#else
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -363,17 +431,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
#else

    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);

    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -669,11 +738,44 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    float sumf = 0;

    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}

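Each q2_K scale byte above carries two packed fields: the low nibble scales the 2-bit quants (the dall term) and the high nibble weights the per-16 q8 block sums that feed the dmin correction. Split out for clarity (illustrative helper, not from the source):

#include <stdint.h>

// Split one q2_K scale byte as the scalar loop does with
// (sc[is] & 0xF) for the scale and (sc[j] >> 4) for the min.
static void q2_K_scale_fields(uint8_t sc, int * scale, int * min) {
    *scale = sc & 0xF;  // multiplies the 2-bit quant dot product
    *min   = sc >> 4;   // multiplies y bsums for the dmin subtraction
}
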
@@ -1045,14 +1147,68 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif

}
@@ -1378,15 +1534,60 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(nb);
    UNUSED(utmp);
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

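The utmp/kmask shuffle above rearranges q4_K's 12-byte scale block so that eight 6-bit scales and eight 6-bit mins can then be read as plain bytes through the scales/mins aliases. The same layout can also be read directly bit by bit; this sketch follows ggml's get_scale_min_k4 logic as remembered and should be checked against the source:

#include <stdint.h>

// Read 6-bit scale j and min j straight out of the 12-byte q4_K scale block.
static void get_scale_min_k4(int j, const uint8_t * q, uint8_t * sc, uint8_t * m) {
    if (j < 4) {
        *sc = q[j] & 63;
        *m  = q[j + 4] & 63;
    } else {
        *sc = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        *m  = (q[j + 4] >> 4)  | ((q[j    ] >> 6) << 4);
    }
}
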
@@ -1497,15 +1698,65 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(nb);
    UNUSED(utmp);
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1773,11 +2024,46 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

#else

    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}


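The q6_K unpack above rebuilds each 6-bit weight from a 4-bit low part (one nibble of ql) and a 2-bit high part selected out of qh, then re-centers from [0, 63] to [-32, 31]. The same step in isolation (illustrative helper; lo4 is the already-selected nibble):

#include <stdint.h>

// Rebuild one q6_K weight from its selected low nibble and the two
// qh bits at 'shift', re-centered as in the loop above.
static int8_t q6_K_weight(uint8_t lo4, uint8_t qh, int shift) {
    return (int8_t)((lo4 | (((qh >> shift) & 3) << 4)) - 32);
}
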
@@ -112,7 +112,31 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
    }

#endif
    ggml_gemv_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
    {
        float sumf[8];
        int sumi;

        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
    }
}

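The (int8_t)(b << 4) / (b & 0xF0) pair in the GEMV loop above places each nibble in the top bits of a byte so that sign extension and the multiply happen at 16x scale, and a single arithmetic >> 4 at the end rescales the summed products exactly. The trick in isolation (illustrative helper; a_lo and a_hi stand for the two q8 activations each packed byte pairs with in the interleaved layout):

#include <stdint.h>

// Dot the two 4-bit values packed in b with two int8 activations using
// the shift trick from the loop above: both products carry a common
// factor of 16, so the final shift is exact.
static int packed_nibble_dot(uint8_t b, int8_t a_lo, int8_t a_hi) {
    const int v0 = (int8_t)(b << 4);    // low nibble, sign-extended, x16
    const int v1 = (int8_t)(b & 0xF0);  // high nibble, sign-extended, x16
    return ((v0 * a_lo) + (v1 * a_hi)) >> 4;
}
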
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -337,6 +361,37 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
        return;
    }

#endif
    ggml_gemm_q4_0_8x8_q8_0_generic(n, s, bs, vx, vy, nr, nc);
#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                                const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

@@ -172,15 +172,24 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = acc[0] + acc[1] + acc[2] + acc[3];

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
            const int v1 = (x[ib].qs[j] >> 4) - 8;

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }

    *s = sumf;
}

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -230,15 +239,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[ib].qs[j] & 0x0F);
            const int v1 = (x[ib].qs[j] >> 4);

            sumi0 += (v0 * y[ib].qs[j]);
            sumi1 += (v1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -280,15 +298,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

    sumf = acc[0] + acc[1] + acc[2] + acc[3];

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -421,13 +442,70 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -522,14 +600,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -642,14 +767,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -792,10 +969,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sum;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -972,15 +1186,17 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
    }

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}

void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -1048,10 +1264,37 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;
        for (int ib = 0; ib < QK_K/32; ib += 2) {
            const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
            const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
            h >>= 4;
            const float d1 = d4d8*(ls1 - 32);
            const float d2 = d4d8*(ls2 - 32);
            int sumi1 = 0, sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d1 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
            sumi1 = sumi2 = 0;
            for (int j = 0; j < 16; ++j) {
                sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
                sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
            }
            sumf += d2 * (sumi1 + sumi2);
            qs += 16;
            q8 += 32;
        }
    }
    *s = sumf;
#endif
}


@@ -435,15 +435,30 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));

            const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
            const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }

    *s = sumf;
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -530,15 +545,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        uint32_t qh;
        memcpy(&qh, x[ib].qh, sizeof(qh));

        int sumi0 = 0;
        int sumi1 = 0;

        for (int j = 0; j < qk/2; ++j) {
            const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
            const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

            const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
            const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;

            sumi0 += (x0 * y[ib].qs[j]);
            sumi1 += (x1 * y[ib].qs[j + qk/2]);
        }

        int sumi = sumi0 + sumi1;
        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }

    *s = sumf;
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -598,15 +628,18 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
           wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
    for (; ib < nb; ++ib) {
        int sumi = 0;

        for (int j = 0; j < qk; j++) {
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }

        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }

    *s = sumf;
}

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -722,10 +755,45 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {

        const uint8_t * q2 = x[i].qs;
        const int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;

        int summs = 0;
        for (int j = 0; j < 16; ++j) {
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }

        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        int isum = 0;
        int is = 0;
        int d;
        for (int k = 0; k < QK_K/128; ++k) {
            int shift = 0;
            for (int j = 0; j < 4; ++j) {
                d = sc[is++] & 0xF;
                int isuml = 0;
                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                d = sc[is++] & 0xF;
                isuml = 0;
                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
                isum += d * isuml;
                shift += 2;
                q8 += 32;
            }
            q2 += 32;
        }
        sumf += dall * isum - dmin * summs;
    }
    *s = sumf;
#endif
}

@@ -834,12 +902,68 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
    // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
    // manually vectorized version above. Every other version I tried would run at least 4 times slower.
    // The ideal situation would be if we could just write the code once, and the compiler would
    // automatically produce the best possible set of machine instructions, instead of us having to manually
    // write vectorized versions for AVX, ARM_NEON, etc.

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    uint32_t auxs[4];
    const int8_t * scales = (const int8_t*)auxs;

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].hmask;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
            for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
            a += 32; m <<= 1;
            q3 += 32;
        }
        a = aux8;

        memcpy(auxs, x[i].scales, 12);
        uint32_t tmp = auxs[2];
        auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
        auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
        auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
        auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
        for (int j = 0; j < QK_K/16; ++j) {
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;

#endif

}
@@ -965,14 +1089,61 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            a += 32;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            a += 32; q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1108,14 +1279,66 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins = (const uint8_t*)&utmp[2];

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const uint8_t * GGML_RESTRICT hm = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;
        for (int j = 0; j < QK_K/64; ++j) {
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
            for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}

@@ -1212,10 +1435,47 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);

    int8_t aux8[QK_K];
    int16_t aux16[8];
    float sums [8];
    int32_t aux32[8];
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                a[l +  0] = (int8_t)((q4[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                a[l + 64] = (int8_t)((q4[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                a[l + 96] = (int8_t)((q4[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            }
            a  += 128;
            q4 += 64;
            qh += 32;
        }
        a = aux8;
        int is = 0;
        for (int j = 0; j < QK_K/16; ++j) {
            int scale = x[i].scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
#endif
}


@@ -253,12 +253,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type = GGML_TYPE_Q8_1,
        .nrows        = 1,
    },
    [GGML_TYPE_MXFP4] = {
        .from_float   = quantize_row_mxfp4,
        .vec_dot      = ggml_vec_dot_mxfp4_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
        .nrows        = 1,
    },
    [GGML_TYPE_Q2_K] = {
        .from_float   = quantize_row_q2_K,
        .vec_dot      = ggml_vec_dot_q2_K_q8_K,
@@ -1676,10 +1670,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_add(params, tensor);
            } break;
        case GGML_OP_ADD_ID:
            {
                ggml_compute_forward_add_id(params, tensor);
            } break;
        case GGML_OP_ADD1:
            {
                ggml_compute_forward_add1(params, tensor);
@@ -1934,7 +1924,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_compute_forward_flash_attn_ext(params, tensor);
                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
@@ -2022,11 +2012,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                ggml_compute_forward_opt_step_adamw(params, tensor);
            }
            break;
        case GGML_OP_OPT_STEP_SGD:
            {
                ggml_compute_forward_opt_step_sgd(params, tensor);
            }
            break;
        case GGML_OP_NONE:
            {
                // nop
@@ -2126,7 +2111,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_DUP:
        case GGML_OP_CONT:
        case GGML_OP_ADD:
        case GGML_OP_ADD_ID:
        case GGML_OP_ADD1:
        case GGML_OP_ACC:
            {
@@ -2188,7 +2172,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                    case GGML_GLU_OP_REGLU:
                    case GGML_GLU_OP_GEGLU:
                    case GGML_GLU_OP_SWIGLU:
                    case GGML_GLU_OP_SWIGLU_OAI:
                    case GGML_GLU_OP_GEGLU_ERF:
                    case GGML_GLU_OP_GEGLU_QUICK:
                        {
@@ -2330,7 +2313,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_CROSS_ENTROPY_LOSS:
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
        case GGML_OP_OPT_STEP_ADAMW:
        case GGML_OP_OPT_STEP_SGD:
            {
                n_tasks = n_threads;
            } break;
@@ -2691,7 +2673,6 @@ struct ggml_cplan ggml_graph_plan(
                    }
                } break;
            case GGML_OP_ADD:
            case GGML_OP_ADD_ID:
            case GGML_OP_ADD1:
                {
                    if (ggml_is_quantized(node->src[0]->type)) {

@@ -35,7 +35,7 @@

// ggml-backend interface

std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types() {
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

@@ -57,6 +57,8 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
        }
#endif

        bufts.push_back(NULL);

        return bufts;
    }();

@@ -64,20 +66,14 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
}

static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
    static std::vector<ggml_backend_buffer_type_t> extra_bufts = [] {
        std::vector<ggml_backend_buffer_type_t> bufts = ggml_backend_cpu_get_extra_buffer_types();
        bufts.push_back(nullptr);
        return bufts;
    }();

    return extra_bufts.data();
    return ggml_backend_cpu_get_extra_buffers_type().data();

    GGML_UNUSED(device);
}

static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
    for (auto * extra : ggml_backend_cpu_get_extra_buffer_types()) {
        if (extra == buft) {
    for (auto * extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra == buft) {
            return true;
        }
    }
@@ -214,10 +210,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
    ctx->abort_callback_data = NULL;

    ggml_backend_t cpu_backend = new ggml_backend {
        /* .guid      = */ ggml_backend_cpu_guid(),
        /* .iface     = */ ggml_backend_cpu_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context   = */ ctx,
        /* .guid      = */ ggml_backend_cpu_guid(),
        /* .interface = */ ggml_backend_cpu_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context   = */ ctx,
    };

    if (cpu_backend == NULL) {
@@ -401,13 +397,20 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
        return true;
    }

    // check extra buffer types
    // note: only the first sources are checked for extra buffer types to reduce overhead, increase if necessary
    for (int i = 0; i < 4; i++) {
        if (op->src[i] && op->src[i]->buffer &&
            ggml_backend_cpu_is_extra_buffer_type(op->src[i]->buffer->buft)) {
            auto * buf_extra = (ggml::cpu::extra_buffer_type *) op->src[i]->buffer->buft->context;
            return buf_extra->supports_op(dev, op);
    // extra_buffer_op?
    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra) {
            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
            if (buf_extra && buf_extra->supports_op(dev, op)) {
                return true;
            }
        }
    }

    // all other cases need a host buffer
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
            return false;
        }
}
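
The buffer-type registry above uses a common C++ idiom: a function-local static vector initialized exactly once by an immediately invoked lambda, with a trailing null pointer acting as a sentinel so the storage can be walked like a C array. A minimal sketch of that idiom under assumed stand-in types (the registry entries here are invented for illustration):

// Illustrative sketch, not part of the diff above.
#include <cstdio>
#include <vector>

using buft_t = const char *; // stand-in for ggml_backend_buffer_type_t

static std::vector<buft_t> & get_extra_buffer_types() {
    // initialized once, on first call, in a thread-safe way (C++11 magic static)
    static std::vector<buft_t> bufts = []() {
        std::vector<buft_t> b;
        b.push_back("amx");    // hypothetical entry
        b.push_back("repack"); // hypothetical entry
        b.push_back(nullptr);  // sentinel terminator
        return b;
    }();
    return bufts;
}

int main(void) {
    for (buft_t * p = get_extra_buffer_types().data(); *p; ++p) {
        printf("%s\n", *p);
    }
}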
@@ -259,10 +259,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const int64_t m_start = 0;

        const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
        int64_t num_threads = KAI_MIN(n / n_step, nth);
        if (num_threads <= 0) {
            num_threads = 1;
        }
        const int64_t num_threads = KAI_MIN(n / n_step, nth);

        if (ith < num_threads) {
            const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
@@ -312,8 +309,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        GGML_ASSERT(kernel);

        const int ith = params->ith;
        const int nth_raw = params->nth;
        const int nth = nth_raw > 0 ? nth_raw : 1;
        const int nth = params->nth;

        const size_t k = ne00;
        const size_t m = ne11;
@@ -331,12 +327,9 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
        const size_t n_start = ith * num_n_per_thread;

        size_t n_to_process = 0;
        if (n_start < n) {
            n_to_process = num_n_per_thread;
            if ((n_start + n_to_process) > n) {
                n_to_process = n - n_start;
            }
        size_t n_to_process = num_n_per_thread;
        if ((n_start + n_to_process) > n) {
            n_to_process = n - n_start;
        }

        // Calculate number of columns to be processed per thread
@@ -368,10 +361,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        const void * lhs_ptr = (const void *)((const char *) lhs_packed + lhs_packed_offset);
        float * dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);

        if (n_to_process > 0) {
            variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
                               sizeof(float), -FLT_MAX, FLT_MAX);
        }
        variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
                           sizeof(float), -FLT_MAX, FLT_MAX);

        return true;
}
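
The partitioning logic above rounds the per-thread column count up to a multiple of the kernel's n_step and clamps the final thread at n. A standalone sketch of that arithmetic (round_up here is a local helper written for this sketch, not the KleidiAI kai_roundup):

// Illustrative sketch, not part of the diff above.
#include <cstdio>

static size_t round_up(size_t v, size_t step) { return ((v + step - 1) / step) * step; }

int main(void) {
    const size_t n = 100, nth = 3, n_step = 8;
    // per-thread share, rounded up to a multiple of n_step: here 40 columns
    const size_t per_thread = round_up(round_up(n, nth) / nth, n_step);
    for (size_t ith = 0; ith < nth; ++ith) {
        const size_t start = ith * per_thread;
        size_t todo = start < n ? per_thread : 0;
        if (start + todo > n) todo = n - start; // clamp the tail thread
        printf("thread %zu: cols [%zu, %zu)\n", ith, start, start + todo);
    }
}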
@@ -8,7 +8,6 @@
#include "vec.h"

#include <float.h>
#include <algorithm>

// ggml_compute_forward_dup

@@ -1284,7 +1283,6 @@ void ggml_compute_forward_add(
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -1311,77 +1309,6 @@ void ggml_compute_forward_add(
        }
    }
}

// ggml_compute_forward_add_id

static void ggml_compute_forward_add_id_f32(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(src2->type == GGML_TYPE_I32);

    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));

    const int ith = params->ith;
    const int nth = params->nth;

    const int nr = ggml_nrows(src0);

    GGML_TENSOR_TERNARY_OP_LOCALS

    GGML_ASSERT( nb0 == sizeof(float));
    GGML_ASSERT(nb10 == sizeof(float));

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 indices
        const int i3 = ir/(ne2*ne1);
        const int i2 = (ir - i3*ne2*ne1)/ne1;
        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

        // src1 indices
        const int i11 = *(int32_t *) ((char *) src2->data + i1*nb20 + i2*nb21);

        GGML_ASSERT(i11 >= 0 && i11 < ne11);

        ggml_vec_add_f32(ne0,
                (float *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1 ),
                (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
                (float *) ((char *) src1->data + i11*nb11));
    }
}

void ggml_compute_forward_add_id(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_add_id_f32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("unsupported type for ggml_compute_forward_add_id: %s", ggml_type_name(src0->type));
            }
    }
}
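
In plain terms, ADD_ID adds to each row of src0 one row of src1, selected per destination row by an integer index tensor (src2). A small self-contained sketch of the same gather-and-add on flat arrays:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    const int ne0 = 4; // row width
    float src0[2][4] = {{1, 1, 1, 1}, {2, 2, 2, 2}};
    float src1[3][4] = {{0, 0, 0, 0}, {10, 10, 10, 10}, {20, 20, 20, 20}};
    int   ids[2]     = {2, 1}; // which src1 row to add to each src0 row

    float dst[2][4];
    for (int r = 0; r < 2; ++r) {
        for (int c = 0; c < ne0; ++c) {
            dst[r][c] = src0[r][c] + src1[ids[r]][c]; // dst[r] = src0[r] + src1[ids[r]]
        }
    }
    printf("%g %g\n", dst[0][0], dst[1][0]); // 21 12
}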
// ggml_compute_forward_add1

static void ggml_compute_forward_add1_f32(
@@ -1733,7 +1660,6 @@ void ggml_compute_forward_add1(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -1861,7 +1787,6 @@ void ggml_compute_forward_acc(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -3689,93 +3614,6 @@ static void ggml_compute_forward_swiglu(
    }
}

// ggml_compute_forward_swiglu_oai

static void ggml_compute_forward_swiglu_oai_f32(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    char * src0_d = (char *) src0->data;
    char * src1_d = (char *) (src1 ? src1->data : src0->data);
    const size_t src0_o = src0->nb[1];
    const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(ggml_is_contiguous_1(dst));

    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src0->type == src1->type);
    }

    const int ith = params->ith;
    const int nth = params->nth;

    const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
    const int nr = ggml_nrows(src0);

    GGML_ASSERT(dst->ne[0] == nc);
    GGML_ASSERT(ggml_nrows(dst) == nr);

    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
    const float   alpha   = ggml_get_op_params_f32(dst, 2);
    const float   limit   = ggml_get_op_params_f32(dst, 3);

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int i1 = ir0; i1 < ir1; i1++) {
        float * src0_p = (float *) (src0_d + i1*src0_o);
        float * src1_p = (float *) (src1_d + i1*src1_o);
        float * dst_p  = (float *) ((char *) dst->data + i1*(dst->nb[1]));

        if (!src1) {
            src0_p += swapped ? nc : 0;
            src1_p += swapped ? 0 : nc;
        }

        for (int k = 0; k < nc; k++) {
            const float x = std::min(src0_p[k], limit);
            const float y = std::clamp(src1_p[k], -limit, limit);
            const float out_glu = x / (1.f + expf(alpha * (-x)));
            dst_p[k] = out_glu * (y + 1.f);
        }

#ifndef NDEBUG
        for (int k = 0; k < nc; k++) {
            const float x = dst_p[k];
            GGML_UNUSED(x);
            assert(!isnan(x));
            assert(!isinf(x));
        }
#endif
    }
}

static void ggml_compute_forward_swiglu_oai(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_swiglu_oai_f32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}
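
Per element, the kernel above computes out = glu(x) * (y + 1) with glu(x) = x * sigmoid(alpha * x), after clamping x from above by limit and y to [-limit, limit]. A minimal scalar sketch of that formula:

// Illustrative sketch, not part of the diff above.
#include <algorithm>
#include <cmath>
#include <cstdio>

static float swiglu_oai(float x, float y, float alpha, float limit) {
    x = std::min(x, limit);
    y = std::clamp(y, -limit, limit);
    const float glu = x / (1.0f + std::exp(alpha * (-x))); // x * sigmoid(alpha*x)
    return glu * (y + 1.0f);
}

int main(void) {
    printf("%f\n", swiglu_oai(1.0f, 0.5f, 1.702f, 7.0f));
}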
// ggml_compute_forward_geglu_erf

static void ggml_compute_forward_geglu_erf_f32(
@@ -4761,7 +4599,6 @@ void ggml_compute_forward_out_prod(
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5036,7 +4873,6 @@ void ggml_compute_forward_set(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5298,7 +5134,6 @@ void ggml_compute_forward_get_rows(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -5688,7 +5523,6 @@ static void ggml_compute_forward_soft_max_f32(

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    assert(ggml_is_contiguous(dst));
    assert(ggml_are_same_shape(src0, dst));
@@ -5723,9 +5557,6 @@ static void ggml_compute_forward_soft_max_f32(

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

    // sinks
    const float * sk = src2 ? (float *)((char *) src2->data) : nullptr;

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
@@ -5768,18 +5599,9 @@ static void ggml_compute_forward_soft_max_f32(
                float max = -INFINITY;
                ggml_vec_max_f32(ne00, &max, wp);

                // if we have sinks, make a correction as if they were included in the softmax
                if (sk) {
                    max = MAX(max, sk[i02]);
                }

                ggml_float sum = ggml_vec_soft_max_f32(ne00, dp, wp, max);
                assert(sum > 0.0);

                if (sk) {
                    sum += (ggml_float) expf(sk[i02] - max);
                }

                sum = 1.0/sum;
ggml_vec_scale_f32(ne00, dp, sum);
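
A sink behaves like one extra logit that participates in the normalizer but produces no output element: both the running max and the sum account for it, as in the hunk above. A small sketch of softmax over n logits with one sink value:

// Illustrative sketch, not part of the diff above.
#include <cmath>
#include <cstdio>

// softmax over x[0..n) as if an extra logit `sink` were present (but not emitted)
static void softmax_with_sink(const float * x, float * out, int n, float sink) {
    float max = sink;
    for (int i = 0; i < n; ++i) max = x[i] > max ? x[i] : max;

    double sum = std::exp(sink - max); // the sink's share of the probability mass
    for (int i = 0; i < n; ++i) {
        out[i] = std::exp(x[i] - max);
        sum += out[i];
    }
    for (int i = 0; i < n; ++i) out[i] /= (float) sum; // row now sums to < 1
}

int main(void) {
    float x[3] = {1.0f, 2.0f, 3.0f}, out[3];
    softmax_with_sink(x, out, 3, 0.5f);
    printf("%f %f %f\n", out[0], out[1], out[2]);
}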
@@ -6014,7 +5836,6 @@ void ggml_compute_forward_clamp(
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_MXFP4:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
@@ -8168,14 +7989,12 @@ void ggml_compute_forward_argsort(

static void ggml_compute_forward_flash_attn_ext_f16(
        const ggml_compute_params * params,
        const ggml_tensor * q,
        const ggml_tensor * k,
        const ggml_tensor * v,
        const ggml_tensor * mask,
        ggml_tensor * dst) {

    const ggml_tensor * q     = dst->src[0];
    const ggml_tensor * k     = dst->src[1];
    const ggml_tensor * v     = dst->src[2];
    const ggml_tensor * mask  = dst->src[3];
    const ggml_tensor * sinks = dst->src[4];

    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
@@ -8370,23 +8189,6 @@ static void ggml_compute_forward_flash_attn_ext_f16(
            }
        }

        // sinks
        if (sinks) {
            const float s = ((float *)((char *) sinks->data))[h];

            float ms = 1.0f;
            float vs = 1.0f;

            if (s > M) {
                ms = expf(M - s);
                ggml_vec_scale_f32(DV, VKQ32, ms);
            } else {
                vs = expf(s - M);
            }

            S = S*ms + vs;
        }

        // V /= S
        const float S_inv = 1.0f/S;
ggml_vec_scale_f32(DV, VKQ32, S_inv);
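
The sink branch above is the standard online-softmax correction: if the sink logit s exceeds the running max M, the accumulated values are rescaled by exp(M - s); otherwise the sink only adds exp(s - M) to the normalizer S. A scalar sketch of that update (the function is illustrative; names mirror the code above):

// Illustrative sketch, not part of the diff above.
#include <cmath>
#include <cstdio>

// fold one sink logit `s` into a running (max M, normalizer S, accumulator V)
static void fold_sink(float s, float & M, float & S, float * V, int n) {
    float ms = 1.0f; // rescale factor for the existing accumulator
    float vs = 1.0f; // weight the sink contributes to the normalizer
    if (s > M) {
        ms = std::exp(M - s);
        for (int i = 0; i < n; ++i) V[i] *= ms;
        M = s; // the kernel above can skip this, since M is not needed afterwards
    } else {
        vs = std::exp(s - M);
    }
    S = S*ms + vs;
}

int main(void) {
    float V[2] = {1.0f, 2.0f}, M = 0.0f, S = 1.0f;
    fold_sink(1.5f, M, S, V, 2);
    printf("M=%f S=%f V0=%f\n", M, S, V[0]);
}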
@@ -8406,13 +8208,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(

void ggml_compute_forward_flash_attn_ext(
        const ggml_compute_params * params,
        const ggml_tensor * q,
        const ggml_tensor * k,
        const ggml_tensor * v,
        const ggml_tensor * mask,
        ggml_tensor * dst) {
    switch (dst->op_params[3]) {
        case GGML_PREC_DEFAULT:
        case GGML_PREC_F32:
            {
                // uses F32 accumulators
                ggml_compute_forward_flash_attn_ext_f16(params, dst);
                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
            } break;
        default:
            {
@@ -9274,10 +9080,6 @@ void ggml_compute_forward_glu(
            {
                ggml_compute_forward_swiglu(params, dst);
            } break;
        case GGML_GLU_OP_SWIGLU_OAI:
            {
                ggml_compute_forward_swiglu_oai(params, dst);
            } break;
        case GGML_GLU_OP_GEGLU_ERF:
            {
                ggml_compute_forward_geglu_erf(params, dst);
@@ -10330,7 +10132,6 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const int ir1 = MIN(ir0 + dr, nr);

    const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);

    const float alpha = adamw_params_ptr[0];
    const float beta1 = adamw_params_ptr[1];
    const float beta2 = adamw_params_ptr[2];
@@ -10338,7 +10139,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
    const float wd     = adamw_params_ptr[4];
    const float beta1h = adamw_params_ptr[5];
    const float beta2h = adamw_params_ptr[6];
    const float keep   = 1.f - alpha * wd;

    for (int ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir/(ne02*ne01);
        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
@@ -10361,7 +10162,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
            // The weight decay is applied independently of the Adam momenta m and v.
            // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
            // See: https://arxiv.org/pdf/1711.05101v3.pdf
            w[i00] = w[i00] * keep - alpha * mh / vh;
            w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
        }
    }
}
@@ -10383,63 +10184,3 @@ void ggml_compute_forward_opt_step_adamw(
        }
    }
}

static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0       = dst->src[0];
    const ggml_tensor * src0_grad  = dst->src[1];
    const ggml_tensor * sgd_params = dst->src[2];

    GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
    GGML_ASSERT(ggml_nelements(sgd_params) == 2);

    const int ith = params->ith;
    const int nth = params->nth;

    const int nr = ggml_nrows(src0);

    GGML_TENSOR_UNARY_OP_LOCALS
    GGML_ASSERT(nb00 == sizeof(float));

    // rows per thread
    const int dr = (nr + nth - 1) / nth;

    // row range for this thread
    const int ir0 = dr * ith;
    const int ir1 = MIN(ir0 + dr, nr);

    // using adamw param subset we care about - alpha, wd - could have a separate struct
    const float * sgd_params_ptr = ggml_get_data_f32(sgd_params);
    const float   alpha          = sgd_params_ptr[0];
    const float   keep           = 1.f - alpha * sgd_params_ptr[1];

    for (int ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir / (ne02 * ne01);
        const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
        const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);

        const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;

        float       * w = (float       *) ((char       *) src0->data      + offset); // weight
        const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad

        for (int i00 = 0; i00 < ne00; ++i00) {
            w[i00] = w[i00] * keep - alpha * g[i00];
        }
    }
}

void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_opt_step_sgd_f32(params, dst);
            }
            break;
        default:
            {
                GGML_ABORT("fatal error - sgd is F32 only");
            }
    }
}
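
Both optimizers above apply decoupled weight decay: the weight is first shrunk by keep = 1 - alpha*wd and only then moved by the gradient (or Adam) step, following Loshchilov & Hutter (the arXiv link in the AdamW comment). A minimal sketch of the two update rules; mh and vh stand for the already bias-corrected moment terms exactly as in the code above:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    const float alpha = 0.01f, wd = 0.1f;
    const float keep  = 1.0f - alpha * wd; // decoupled decay factor

    // SGD with decoupled weight decay
    float w_sgd = 1.0f, g = 0.5f;
    w_sgd = w_sgd * keep - alpha * g;

    // AdamW: same decay factor, step uses the bias-corrected moments mh / vh
    float w_adamw = 1.0f, mh = 0.4f, vh = 2.0f;
    w_adamw = w_adamw * keep - alpha * mh / vh;

    printf("sgd=%f adamw=%f\n", w_sgd, w_adamw);
}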
@@ -29,7 +29,6 @@ extern "C" {

void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add_id(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -83,7 +82,13 @@ void ggml_compute_forward_arange(const struct ggml_compute_params * params, stru
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_ext(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_ext(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * q,
        const struct ggml_tensor * k,
        const struct ggml_tensor * v,
        const struct ggml_tensor * mask,
        struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_back(
        const struct ggml_compute_params * params,
        const bool masked,
@@ -107,7 +112,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);

#ifdef __cplusplus
}
#endif

@@ -46,10 +46,6 @@ void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI
    quantize_row_q8_1_ref(x, y, k);
}

void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    quantize_row_mxfp4_ref(x, y, k);
}

//
// 2-6 bit quantization in super-blocks
//
@@ -185,37 +181,6 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
    *s = sumf;
}

void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0  * GGML_RESTRICT y = vy;

    const int nb = n / QK_MXFP4;

    int ib = 0;
    float sumf = 0;

    for (; ib < nb; ++ib) {
        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);

        int sumi1 = 0;
        int sumi2 = 0;
        for (int j = 0; j < QK_MXFP4/2; ++j) {
            sumi1 += y[ib].qs[j + 0]          * kvalues_mxfp4[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
}
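
Formats such as MXFP4 and IQ4_NL store a 4-bit code per weight and decode it through a small lookup table; the dot product accumulates LUT[code] * q8 in integers and applies the combined block scale once at the end, as above. A generic sketch of that pattern (the 16-entry table below is made up for illustration, not kvalues_mxfp4):

// Illustrative sketch, not part of the diff above.
#include <cstdint>
#include <cstdio>

// hypothetical 16-entry codebook; the real formats use kvalues_mxfp4 / kvalues_iq4nl
static const int8_t kLUT[16] = {-8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10};

int main(void) {
    const uint8_t packed[2] = {0x31, 0xF0};  // four 4-bit codes, two per byte
    const int8_t  q8[4]     = {5, -3, 2, 7}; // activations, low-nibble half first
    const float   d         = 0.25f;         // combined block scale

    int sumi = 0;
    for (int j = 0; j < 2; ++j) {
        sumi += q8[j]     * kLUT[packed[j] & 0xF]; // low nibble
        sumi += q8[j + 2] * kLUT[packed[j] >> 4];  // high nibble
    }
    printf("dot = %f\n", d * sumi);
}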

void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

@@ -19,8 +19,6 @@ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_mxfp4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
@@ -41,8 +39,6 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -71,12 +67,8 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_mxfp4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

@@ -206,9 +206,8 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);
    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
@@ -308,28 +307,30 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    int sumi;
    {
        float sumf[8];
        int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}
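
The v0/v1 construction above is a nibble-extraction trick: (int8_t)(q << 4) moves the low nibble into the top four bits and (int8_t)(q & 0xF0) keeps the high nibble there, so both come out as two's-complement nibbles scaled by 16; every product is then a multiple of 16, so the final >> 4 removes that factor exactly. Whether the stored nibbles are already two's complement depends on the repacking step, which is outside this excerpt. A tiny sketch:

// Illustrative sketch, not part of the diff above.
#include <cstdint>
#include <cstdio>

int main(void) {
    const uint8_t q = 0x2B;             // high nibble 2, low nibble 0xB (11)
    const int v0 = (int8_t) (q << 4);   // low  nibble, signed, times 16: (11-16)*16 = -80
    const int v1 = (int8_t) (q & 0xF0); // high nibble, signed, times 16:  2*16      =  32
    const int a0 = 3, a1 = -2;          // q8 activations
    const int sumi = ((v0 * a0) + (v1 * a1)) >> 4; // exact: -5*3 + 2*(-2) = -19
    printf("%d %d %d\n", v0, v1, sumi);
}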
@@ -411,11 +412,11 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
}

void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert (n % qk == 0);
    assert (nc % ncols_interleaved == 0);
@@ -430,136 +431,30 @@ void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[8];
    float sum_minf[8];
    int sumi1,sumi2,sumi3,sumi4;
    int sumi;
    {
        float sumf[4];
        int sumi;

    const block_q8_K * a_ptr = (const block_q8_K *)vy;
    for(int x = 0; x < nc / ncols_interleaved; x++) {
        const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
        for (int j = 0; j < ncols_interleaved; j++) {
            sumf[j] = 0.0;
            sum_minf[j] = 0.0;
        }
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (4 * blocklen)); k++) {
                const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi1 = 0;
                    sumi2 = 0;
                    sumi3 = 0;
                    sumi4 = 0;
                    sumi = 0;
                    int offset = ((k / 2) % 2) + j * 2;
                    for (int i = 0; i < blocklen; ++i){
                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                        const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                        const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                        const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                        sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
                        sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
                        sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
                        sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

                        sumi1 = sumi1 * (scales_0[offset] & 0xF);
                        sumi2 = sumi2 * (scales_1[offset] & 0xF);
                        sumi3 = sumi3 * (scales_2[offset] & 0xF);
                        sumi4 = sumi4 * (scales_3[offset] & 0xF);
                        sumi += sumi1 + sumi2 + sumi3 + sumi4;
            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int j = 0; j < ncols_interleaved; j++) {
                        sumi = 0;
                        for (int i = 0; i < blocklen; ++i) {
                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                        }
                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                    }
                    sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
                }
            }
            for(int sb = 0; sb < 8; sb++) {
                const uint8_t *mins = b_ptr[l].scales + sb * 16;
                for(int j = 0; j < ncols_interleaved; j++){
                    sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
                }
            }
            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
        for (int j = 0; j < ncols_interleaved; j++) {
            s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
        }
    }
}

void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 4;
    const int blocklen = 4;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[4];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}

void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(nr == 1);
    assert(n % qk == 0);
    assert(nc % ncols_interleaved == 0);

    UNUSED(bs);
    UNUSED(nr);

    float sumf[8];
    int sumi;

    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
    for (int x = 0; x < nc / ncols_interleaved; x++) {
        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);

        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                    }
                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                }
            }
        }
        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
}
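
The sumf / sum_minf split in the q2_K kernels above exploits the K-quant decomposition w = d*sc*q - dmin*m: the first term needs per-value products, while the min term only needs the per-sub-block activation sums (the bsums). A small sketch checking that identity on one sub-block:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    const float d = 0.5f, dmin = 0.25f; // super-block scales
    const int   sc = 3, m = 2;          // 4-bit sub-block scale and min
    const int   q[4] = {1, 0, 3, 2};    // 2-bit quants
    const float y[4] = {0.5f, -1.0f, 2.0f, 1.5f};

    // direct: sum_i (d*sc*q_i - dmin*m) * y_i
    float direct = 0.0f;
    for (int i = 0; i < 4; ++i) direct += (d*sc*q[i] - dmin*m) * y[i];

    // factored: d*sc*(q . y) - dmin*m*bsum, with bsum = sum_i y_i
    float qy = 0.0f, bsum = 0.0f;
    for (int i = 0; i < 4; ++i) { qy += q[i]*y[i]; bsum += y[i]; }
    const float factored = d*sc*qy - dmin*m*bsum;

    printf("%f %f\n", direct, factored); // identical
}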
@@ -816,97 +711,6 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
    }
}

void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK_K;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert (n % qk == 0);
    assert (nr % 4 == 0);
    assert (nc % ncols_interleaved == 0);

    UNUSED(s);
    UNUSED(bs);
    UNUSED(vx);
    UNUSED(vy);
    UNUSED(nr);
    UNUSED(nc);
    UNUSED(nb);
    UNUSED(ncols_interleaved);
    UNUSED(blocklen);

    float sumf[4][8];
    float sum_minf[4][8];
    int sumi1, sumi2, sumi3, sumi4;
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    sumf[m][j] = 0.0;
                    sum_minf[m][j] = 0.0;
                }
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (4 * blocklen)); k++) {

                    const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
                    const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
                    const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
                    const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi1 = 0;
                            sumi2 = 0;
                            sumi3 = 0;
                            sumi4 = 0;
                            sumi = 0;
                            int offset = ((k / 2) % 2) + j * 2;
                            for (int i = 0; i < blocklen; ++i){
                                const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
                                const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
                                const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
                                const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
                                sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
                                sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
                                sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
                                sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
                                sumi1 = sumi1 * (scales_0[offset] & 0xF);
                                sumi2 = sumi2 * (scales_1[offset] & 0xF);
                                sumi3 = sumi3 * (scales_2[offset] & 0xF);
                                sumi4 = sumi4 * (scales_3[offset] & 0xF);
                                sumi += sumi1 + sumi2 + sumi3 + sumi4;
                            }
                            sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                        }
                    }
                }
                for(int sb = 0; sb < 8; sb++) {
                    const uint8_t *mins = b_ptr[l].scales + sb * 16;
                    for(int m = 0; m < 4; m++) {
                        const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                        for(int j = 0; j < ncols_interleaved; j++) {
                            int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
                            sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                        }
                    }
                }
            }

            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) {
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
                }
            }
        }
    }
}

void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
@@ -963,50 +767,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
    }
}

void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
    const int qk = QK8_0;
    const int nb = n / qk;
    const int ncols_interleaved = 8;
    const int blocklen = 8;

    assert(n % qk == 0);
    assert(nr % 4 == 0);
    assert(nc % ncols_interleaved == 0);

    float sumf[4][8];
    int sumi;

    for (int y = 0; y < nr / 4; y++) {
        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
        for (int x = 0; x < nc / ncols_interleaved; x++) {
            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
            }
            for (int l = 0; l < nb; l++) {
                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
                    for (int m = 0; m < 4; m++) {
                        for (int j = 0; j < ncols_interleaved; j++) {
                            sumi = 0;
                            for (int i = 0; i < blocklen; ++i) {
                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
                            }
                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                        }
                    }
                }
            }
            for (int m = 0; m < 4; m++) {
                for (int j = 0; j < ncols_interleaved; j++)
                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
            }
        }
    }
}

} // extern "C"

static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1154,50 +914,6 @@ static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_in
    return out;
}

static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
    block_q2_Kx8 out;

    // The delta (scale) and dmin values of the eight Q2_K structures are copied onto the interleaved output structure
    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
    }

    for (int i = 0; i < 8; i++) {
        out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    const int end = QK_K * 2 / blck_size_interleave;

    // Interleave the Q2_K quants by taking 8 bytes at a time
    for (int i = 0; i < end; ++i) {
        int src_id = i % 8;
        int src_offset = (i / 8) * blck_size_interleave;
        int dst_offset = i * blck_size_interleave;

        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    // The logic below unpacks and rearranges the scale and min values of Q2_K.
    // The Q2_K structure packs 16 scales and 16 mins into 16 bytes (4 bits each).
    // The output Q2_Kx8 structure has 128 bytes for storing scales and mins.
    // Each 16-byte group holds the scales and mins of the corresponding sub-blocks from the eight Q2_K structures.
    // For example, the first 16 bytes hold the first and second sub-block scales and mins of the eight Q2_K structures.

    for (int i = 0; i < 128; i++) {

        // Index selecting which Q2_K super-block
        int src1 = (i % 16) / 2;
        // Index selecting the scale within it
        int src2 = ((i / 16) * 2) + (i % 2);

        out.scales[i] = in[src1].scales[src2];
    }
    return out;

}
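
The scales loop above is a pure index permutation: output byte i takes scale byte src2 = (i/16)*2 + (i%2) from super-block src1 = (i%16)/2, so each 16-byte output group collects one pair of sub-block scale bytes from all eight Q2_K inputs. A sketch that just prints the mapping for the first output group:

// Illustrative sketch, not part of the diff above.
#include <cstdio>

int main(void) {
    for (int i = 0; i < 16; ++i) {
        const int src1 = (i % 16) / 2;             // which of the 8 source super-blocks
        const int src2 = ((i / 16) * 2) + (i % 2); // which scale byte within it
        printf("out[%2d] = in[%d].scales[%d]\n", i, src1, src2);
    }
}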

static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
@@ -1259,37 +975,6 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
    GGML_UNUSED(data_size);
}

static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
    GGML_ASSERT(interleave_block == 8);
    constexpr int nrows_interleaved = 8;

    block_q2_Kx8 * dst = (block_q2_Kx8 *) t->data;
    const block_q2_K * src = (const block_q2_K *) data;
    block_q2_K dst_tmp[8];
    int nrow = ggml_nrows(t);
    int nblocks = t->ne[0] / QK_K;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));

    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }

    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    GGML_UNUSED(data_size);
}

static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
    GGML_ASSERT(interleave_block == 8);
@@ -1358,16 +1043,15 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s

static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
    GGML_ASSERT(interleave_block == 4);

    const block_iq4_nl * src = (const block_iq4_nl *) data;
    block_iq4_nlx4 * dst = (block_iq4_nlx4 *) t->data;

    block_iq4_nlx4 * dst = (block_iq4_nlx4 *) t->data;
    const block_iq4_nl * src = (const block_iq4_nl *) data;
    block_iq4_nl dst_tmp[4];

    int nrow = ggml_nrows(t);
    int nrows_interleaved = 4;
    int nblocks = t->ne[0] / QK4_NL;
    int nblocks = t->ne[0] / QK4_0;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

@@ -1389,63 +1073,6 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
||||
block_iq4_nlx8 out;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
out.d[i] = in[i].d;
|
||||
}
|
||||
|
||||
const int end = QK4_NL * 4 / blck_size_interleave;
|
||||
|
||||
if (blck_size_interleave == 8) {
|
||||
for (int i = 0; i < end; ++i) {
|
||||
int src_id = i % 8;
|
||||
int src_offset = (i / 8) * blck_size_interleave;
|
||||
int dst_offset = i * blck_size_interleave;
|
||||
|
||||
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
||||
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
||||
GGML_ASSERT(interleave_block == 8);
|
||||
|
||||
const block_iq4_nl * src = (const block_iq4_nl *)data;
|
||||
block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
||||
|
||||
block_iq4_nl dst_tmp[8];
|
||||
|
||||
int nrow = ggml_nrows(t);
|
||||
int nrows_interleaved = 8;
|
||||
int nblocks = t->ne[0] / QK4_NL;
|
||||
|
||||
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
||||
|
||||
if (t->ne[1] % nrows_interleaved != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (int b = 0; b < nrow; b += nrows_interleaved) {
|
||||
for (int64_t x = 0; x < nblocks; x++) {
|
||||
for (int i = 0; i < nrows_interleaved; i++) {
|
||||
dst_tmp[i] = src[x + i * nblocks];
|
||||
}
|
||||
*dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
||||
}
|
||||
src += nrows_interleaved * nblocks;
|
||||
}
|
||||
return 0;
|
||||
|
||||
GGML_UNUSED(data_size);
|
||||
}
|
||||
|
||||
namespace ggml::cpu::repack {
|
||||
// repack
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
||||
@@ -1468,10 +1095,6 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
|
||||
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
||||
}
|
||||
@@ -1481,10 +1104,6 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
|
||||
// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
|
||||
//}
|
||||
|
||||
template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
||||
return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
||||
}
|
||||
|
||||
// gemv
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemv(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1505,18 +1124,10 @@ template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
||||
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
// gemm
|
||||
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
||||
void gemm(int, float *, size_t, const void *, const void *, int, int);
|
||||
@@ -1537,18 +1148,10 @@ template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t
|
||||
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
|
||||
|
||||
template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
||||
ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
||||
}
class tensor_traits_base : public ggml::cpu::tensor_traits {
public:
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;

@@ -1818,12 +1421,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {

@@ -1847,18 +1446,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                return &q4_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_avx2()) {
            if (cur->ne[1] % 8 == 0) {
                return &iq4_nl_8x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &iq4_nl_4x4_q8_0;
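The selection logic follows one rule: for each quantized type, return the widest interleaved layout whose SIMD requirement the CPU satisfies and whose column multiple divides the tensor's ne[1]; otherwise fall through so the tensor stays in its plain layout. The same rule in miniature, with hardcoded stand-ins for the ggml_cpu_has_* probes:

#include <cstdint>

enum class Layout { None, X4, X8 };

// Stand-ins for the CPU feature probes; hardcoded for illustration only.
static bool has_avx2()         { return true;  }
static bool has_neon_dotprod() { return false; }

// Pick the widest interleave whose column requirement divides n_cols
// (ne[1] in the real code); otherwise leave the tensor un-repacked.
static Layout pick_repack_layout(int64_t n_cols) {
    if (has_avx2() && n_cols % 8 == 0) {
        return Layout::X8;   // widest interleave first
    }
    if (has_neon_dotprod() && n_cols % 4 == 0) {
        return Layout::X4;   // narrower fallback
    }
    return Layout::None;
}

int main() {
    return pick_repack_layout(4096) == Layout::X8 ? 0 : 1;
}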
@@ -44,14 +44,7 @@ struct block_q4_Kx8 {
};

static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");

struct block_q2_Kx8 {
    ggml_half d[8];      // super-block scale for quantized scales
    ggml_half dmin[8];   // super-block scale for quantized mins
    uint8_t scales[128]; // scales and mins, quantized with 4 bits
    uint8_t qs[512];     // 2-bit quants
};

static_assert(sizeof(block_q2_Kx8) == sizeof(ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");

struct block_q8_Kx4 {
    float d[4];          // delta
    int8_t qs[QK_K * 4]; // quants

@@ -67,13 +60,6 @@ struct block_iq4_nlx4 {

static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

struct block_iq4_nlx8 {
    ggml_half d[8];         // deltas for 8 iq4_nl blocks
    uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
};

static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
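As a sanity check on the q2_K×8 layout, assuming the usual ggml constants (QK_K = 256, 2-byte ggml_half): d[8] plus dmin[8] give 32 bytes, scales[128] is QK_K/2 bytes, and the 2-bit quants of eight 256-value sub-blocks pack into 256 * 8 / 4 = QK_K * 2 = 512 bytes, i.e. 672 bytes total, exactly what the static_assert pins down. Spelled out:

#include <cstdint>

// Assumed values of the ggml constants used by the asserts above.
constexpr int QK_K = 256;
constexpr int GGML_HALF_SIZE = 2; // sizeof(ggml_half), an IEEE fp16

// d[8] + dmin[8] -> 16 halves; scales -> QK_K/2 bytes; 2-bit quants of
// 8 blocks x 256 values -> QK_K * 2 bytes.
constexpr int q2_Kx8_size = GGML_HALF_SIZE * 16 + QK_K / 2 + QK_K * 2;
static_assert(q2_Kx8_size == 672, "16*2 + 128 + 512 = 672 bytes");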
#if defined(__cplusplus)
extern "C" {
#endif

@@ -85,16 +71,12 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

// Native implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);

@@ -104,16 +86,12 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#if defined(__cplusplus)
} // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
} // namespace ggml::cpu

bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
}

bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);

@@ -33,6 +33,6 @@ class extra_buffer_type {
} // namespace ggml::cpu

// implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();

#endif
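Both functions share one double-dispatch shape: walk the registered extra buffer types, cast each entry's context to the C++ extra_buffer_type wrapper, and ask it for traits matching the op; the first owner wins, and a miss falls back to the default CPU path. A stripped-down sketch of that shape, with made-up registry and trait types rather than the real ggml API:

#include <vector>

struct op_t { int kind; };

// Hypothetical trait object: knows how to run one family of ops.
struct traits_t {
    virtual bool forward(const op_t & op) = 0;
    virtual ~traits_t() = default;
};

// Hypothetical registry entry: may or may not own traits for an op.
struct buffer_type_t {
    traits_t * (*get_tensor_traits)(const op_t &);
};

static std::vector<buffer_type_t> & registry() {
    static std::vector<buffer_type_t> r;
    return r;
}

// Mirror of the dispatch loop: first registered owner of the op wins.
static bool extra_compute_forward(const op_t & op) {
    for (auto & bt : registry()) {
        if (bt.get_tensor_traits) {
            if (traits_t * t = bt.get_tensor_traits(op)) {
                return t->forward(op);
            }
        }
    }
    return false; // fall back to the default CPU path
}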
@@ -55,22 +55,7 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
-    int i = 0;
-#if defined(__AVX2__)
-    for (; i + 7 < n; i += 8) {
-        __m256 vx = _mm256_loadu_ps(x + i);
-        __m256 vy = _mm256_loadu_ps(y + i);
-        __m256 vz = _mm256_add_ps(vx, vy);
-        _mm256_storeu_ps(z + i, vz);
-    }
-#endif
-    for (; i < n; ++i) {
-        z[i] = x[i] + y[i];
-    }
-}
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
@@ -1007,9 +992,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
-        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
-        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
-        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
    }
}
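Both variants of the loop body compute SwiGLU, y[i] = silu(x[i]) * g[i] with silu(v) = v / (1 + exp(-v)); only the local variable names changed. A scalar float reference, handy for validating the f16 path against:

#include <cmath>

// Reference SwiGLU: y[i] = silu(x[i]) * g[i], silu(v) = v / (1 + exp(-v)).
static void swiglu_f32_ref(int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const float v = x[i];
        y[i] = (v / (1.0f + std::exp(-v))) * g[i];
    }
}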
@@ -120,10 +120,6 @@ if (CUDAToolkit_FOUND)

    set(CUDA_FLAGS -use_fast_math -extended-lambda)

-    if (GGML_CUDA_DEBUG)
-        list(APPEND CUDA_FLAGS -lineinfo)
-    endif()

    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
        # Options are:
        # - none (not recommended)
@@ -1,58 +0,0 @@
#include "add-id.cuh"

static __global__ void add_id_kernel(
        const float * src0, const float * src1, const int32_t * src2, float * dst,
        int64_t ne0, int64_t ne1,
        size_t nb01, size_t nb02,
        size_t nb11,
        size_t nb21
    ) {

    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.y;

    const int i11 = *(int32_t *) ((char *) src2 + i1*sizeof(int32_t) + i2*nb21);

    const size_t nb1 = ne0 * sizeof(float);
    const size_t nb2 = ne1 * nb1;

    float * dst_row = (float *)((char *)dst + i1*nb1 + i2*nb2);
    const float * src0_row = (const float *)((char *)src0 + i1*nb01 + i2*nb02);
    const float * src1_row = (const float *)((char *)src1 + i11*nb11);

    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        dst_row[i0] = src0_row[i0] + src1_row[i0];
    }
}

void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    GGML_TENSOR_TERNARY_OP_LOCALS

    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(src2->type == GGML_TYPE_I32);

    GGML_ASSERT(nb00 == sizeof(float));
    GGML_ASSERT(nb10 == sizeof(float));
    GGML_ASSERT(nb20 == sizeof(int32_t));

    const float * src0_d = (const float *)src0->data;
    const float * src1_d = (const float *)src1->data;
    const int32_t * src2_d = (const int32_t *)src2->data;
    float * dst_d = (float *)dst->data;

    int threads = std::min((int)ne00, 768); // cols
    dim3 blocks(ne01, ne02); // n_experts_used, n_tokens
    add_id_kernel<<<blocks, threads, 0, ctx.stream()>>>(
        src0_d, src1_d, src2_d, dst_d,
        ne0, ne1,
        nb01, nb02,
        nb11,
        nb21
    );
}
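Unrolled from the strides, the kernel computes dst[i0, i1, i2] = src0[i0, i1, i2] + src1[i0, src2[i1, i2]]: src2 selects, per (expert, token) pair, which row of src1 to add - the MoE per-expert bias pattern. A plain C++ reference of the same indexing on contiguous data, as a sketch to check the kernel against:

#include <cstdint>

// dst[i0, i1, i2] = src0[i0, i1, i2] + src1[i0, row[i1, i2]]
// ne0 = cols, ne1 = experts used per token, ne2 = tokens (contiguous data).
static void add_id_ref(const float * src0, const float * src1,
                       const int32_t * row, float * dst,
                       int64_t ne0, int64_t ne1, int64_t ne2) {
    for (int64_t i2 = 0; i2 < ne2; ++i2) {
        for (int64_t i1 = 0; i1 < ne1; ++i1) {
            const int32_t r = row[i2*ne1 + i1];          // selected src1 row
            const float * a = src0 + (i2*ne1 + i1)*ne0;  // input row
            const float * b = src1 + (int64_t) r * ne0;  // selected bias row
            float * d       = dst  + (i2*ne1 + i1)*ne0;
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                d[i0] = a[i0] + b[i0];
            }
        }
    }
}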
@@ -1,3 +0,0 @@
#include "common.cuh"

void ggml_cuda_op_add_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
Some files were not shown because too many files have changed in this diff.