mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00

Compare commits (197 commits):

a457551332 704bb7a71c 435a6d10d6 f9a867f592 ac44eb6c80 a57d1bcb3c cb9178f885
4a5686da22 98bab638fb 26a48ad699 ffd59e7d18 105554595f 04655063c4 20b7bf8a32
6efcd65945 699f4392a3 08382869a2 bb4f7a9e4e b8eeb8741d 17a1f0d2d4 8f22dc0a53
53903ae6fa 4d0dcd4a06 75c91de6e9 68155c66f0 e1a7059053 12f55c302b b9c3eefde1
6491d6e4f1 e592be1575 a0374a67e2 ddef99522d 6681688146 bac8bed248 b81510a7b7
ef797db357 67d1ef23c6 7b50f7c025 c79184d2d1 499a8f5a78 28657a8229 bee28421be
2b72bedec1 c8c4495b8d 7b63a71a6b 0c2ee38ab7 a70c8a0c4b 9067487c44 d4cdd9c1c3
55c2646b45 e75ba4c043 5d46babdc2 e17991c466 c46944aa25 f3ed38d793 55a1c5a5fd
12a81af45f 8875523eb3 ec68e84c32 307e79d33d d7f5f4e578 c8a4e470f6 603e43dc91
611ba4b264 85841e121d 68b3cd6514 de56944147 1b2aaf28ac 343b6e94b6 6a746cf9c4
eff5e45443 a6a47958a1 f61c05d4b1 431b2c24f3 497be7c01d 79b33b2317 0a5a3b5cdf
745f11fed0 5dd942de59 a7417f5594 eb3fa2913e c839a2da1a e9b6350e61 caf5681fcb
83790b0e7e f47c1d7106 a5d1fb6212 a0535ffa0d bd9c981d72 27208bf657 63a7bb3c7e
00d5282c7f 566c16fcce b25e92774e 6609507a91 ceb1bf5a34 72babea5de 43678060c1
8d94219a4a f667f1e624 8846aace49 a01047b041 b25346221d e8215dbb96 5783ae4359
bf5bcd0b85 716301d1b0 60ef23d6c1 b193d53069 2bf9d539dd 73e53dc834 62af464227
c148cf1946 1b809cee22 abf241045d 901e20bbe5 0142961a2e ce82bd0117 bf2a99e3cb
72c6bc3f3d defe2158dd 7b50d589a8 3a9457df96 fa4a9f2a1c 238005c2dc 66aba7aca9
f1f5e82df6 af3373f1ad 5d5c066de8 40bfa04c95 aa064b2eb7 aa0ef5c578 bb16041cae
58cba76a9a 67ae5312e2 692e3cdd0a b23fa0b3f4 06cbedfca1 b7147673f2 d860dd99a4
c959f462a0 22015b2092 dd6e6d0b6a 8308f98c7f 6369be0735 88fc854b4b e28c1b93fd
d27b3ca175 9230dbe2c7 812939a9e9 4c9fdfbe15 9eaa51e7f0 8f71d0f3e8 381174bbda
d67341dc18 456af35eb7 600e3e9b50 fffcce535e 5fc7856815 faed5a5f5d 10bb545c5b
edc4a29eff ed3290ab34 8d94713654 50d2227953 6231c5cd6d ef035803eb 413977de32
95402553a5 3865cff4f5 d03172cc79 dd8e59f443 bbe98d2784 c2056ed6d4 c46503014d
860a9e4eef fe9d60e74a e434e69183 89fea80d29 6adc3c3ebc 0dbcabde8c ad590be98c
7d6d91babf d3e64b9f49 3ba0d843c6 0bf49eb668 4ad243677b c89c2d1ab9 3555b3004b
d7da8dc83a cd355eda7d 30e5b01de2 e54b394082 2c2caa4443 5fce5f948d 9ae4143bc6
c311ac664d
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -e

 # Read the first argument into a variable

@@ -40,7 +40,7 @@ body:
     attributes:
       label: GGML backends
       description: Which GGML backends do you know to be affected?
-      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
       multiple: true
     validations:
       required: true
.github/ISSUE_TEMPLATE/011-bug-results.yml (vendored, 2 changes)

@@ -42,7 +42,7 @@ body:
     attributes:
       label: GGML backends
       description: Which GGML backends do you know to be affected?
-      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
+      options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
       multiple: true
     validations:
       required: true
.github/labeler.yml (vendored, 11 changes)

@@ -1,10 +1,4 @@
 # https://github.com/actions/labeler
-Kompute:
-  - changed-files:
-    - any-glob-to-any-file:
-      - ggml/include/ggml-kompute.h
-      - ggml/src/ggml-kompute/**
-      - README-kompute.md
 Apple Metal:
   - changed-files:
     - any-glob-to-any-file:

@@ -93,3 +87,8 @@ Ascend NPU:
       - ggml/include/ggml-cann.h
       - ggml/src/ggml-cann/**
       - docs/backend/CANN.md
+OpenCL:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml/include/ggml-opencl.h
+      - ggml/src/ggml-opencl/**
.github/workflows/build-cmake-pkg.yml (vendored, new file, 51 lines)

name: Build relocatable cmake package
on:
  workflow_dispatch:
  workflow_call:

jobs:
  linux:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install dependencies
        run: |
          sudo apt update
          sudo apt install -y build-essential tcl

      - name: Build
        run: |
          PREFIX="$(pwd)"/inst
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
            -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release

          export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
          tclsh <<'EOF'
            set build(commit) [string trim [exec git rev-parse --short HEAD]]
            set build(number) [string trim [exec git rev-list --count HEAD]]
            set build(version) "0.0.$build(number)"

            set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
            set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
                             "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
                             "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]

            puts -nonewline "Checking llama-config.cmake version... "
            foreach check $checks {
                if {![regexp -expanded -- $check $llamaconfig]} {
                    puts "\"$check\" failed!"
                    exit 1
                }
            }
            puts "success."
          EOF

          cd examples/simple-cmake-pkg
          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
          cmake --build build
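Note: the package check above can be reproduced outside CI; a minimal sketch following the workflow's own steps (the `inst/` prefix and `pkg-build` directory are illustrative choices, not fixed conventions):

```bash
# configure, build, and install llama.cpp into a local prefix, then build
# the simple-cmake-pkg example against the installed llama-config.cmake
PREFIX="$(pwd)/inst"
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" -DLLAMA_CURL=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release
cmake -S examples/simple-cmake-pkg -B pkg-build -DCMAKE_PREFIX_PATH="$PREFIX/lib/cmake"
cmake --build pkg-build
```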
.github/workflows/build.yml (vendored, 79 changes)

@@ -5,10 +5,43 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: [
+      '.github/workflows/build.yml',
+      '.github/workflows/build-linux-cross.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp'
+    ]

   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: [
+      '.github/workflows/build.yml',
+      '.github/workflows/build-linux-cross.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp'
+    ]

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

@@ -51,7 +84,8 @@ jobs:
           -DCMAKE_BUILD_RPATH="@loader_path" \
           -DLLAMA_FATAL_WARNINGS=ON \
           -DGGML_METAL_USE_BF16=ON \
-          -DGGML_METAL_EMBED_LIBRARY=ON \
+          -DGGML_METAL_EMBED_LIBRARY=OFF \
+          -DGGML_METAL_SHADER_DEBUG=ON \
           -DGGML_RPC=ON
         cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -308,7 +342,7 @@ jobs:
         cd build
         export GGML_VK_VISIBLE_DEVICES=0
         # This is using llvmpipe and runs slower than other backends
-        ctest -L main --verbose --timeout 3600
+        ctest -L main --verbose --timeout 4200

   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04

@@ -478,6 +512,9 @@ jobs:
   build-linux-cross:
     uses: ./.github/workflows/build-linux-cross.yml

+  build-cmake-pkg:
+    uses: ./.github/workflows/build-cmake-pkg.yml
+
   macOS-latest-cmake-ios:
     runs-on: macos-latest

@@ -628,7 +665,7 @@ jobs:
         ./build-xcframework.sh

   windows-msys2:
-    runs-on: windows-latest
+    runs-on: windows-2025

     strategy:
       fail-fast: false

@@ -678,28 +715,31 @@ jobs:
         cmake --build build --config ${{ matrix.build }} -j $(nproc)

   windows-latest-cmake:
-    runs-on: windows-latest
+    runs-on: windows-2025

     env:
       OPENBLAS_VERSION: 0.3.23
       SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.309.0
+      VULKAN_VERSION: 1.4.313.2

     strategy:
       matrix:
         include:
           - build: 'cpu-x64 (static)'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
           - build: 'openblas-x64'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+            arch: 'x64'
+            defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'llvm-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
           - build: 'llvm-arm64-opencl-adreno'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
-          # - build: 'kompute-x64'
-          #   defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'

     steps:
       - name: Clone

@@ -713,12 +753,6 @@ jobs:
           variant: ccache
           evict-old-files: 1d

-      - name: Clone Kompute submodule
-        id: clone_kompute
-        if: ${{ matrix.build == 'kompute-x64' }}
-        run: |
-          git submodule update --init ggml/src/ggml-kompute/kompute
-
       - name: Download OpenBLAS
         id: get_openblas
         if: ${{ matrix.build == 'openblas-x64' }}

@@ -734,9 +768,9 @@ jobs:
       - name: Install Vulkan SDK
         id: get_vulkan
-        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
+        if: ${{ matrix.build == 'vulkan-x64' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

@@ -769,6 +803,8 @@ jobs:
       - name: libCURL
         id: get_libcurl
         uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}

       - name: Build
         id: cmake_build

@@ -778,6 +814,7 @@ jobs:
           cmake -S . -B build ${{ matrix.defines }} `
            -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
           cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

       - name: Add libopenblas.dll
         id: add_libopenblas_dll

@@ -788,7 +825,7 @@ jobs:
       - name: Test
         id: cmake_test
-        if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
+        if: ${{ matrix.arch == 'x64' }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900

@@ -893,7 +930,7 @@ jobs:
         cmake --build build --config Release

   windows-latest-cmake-sycl:
-    runs-on: windows-latest
+    runs-on: windows-2022

     defaults:
       run:

@@ -927,7 +964,7 @@ jobs:
   windows-latest-cmake-hip:
     if: ${{ github.event.inputs.create_release != 'true' }}
-    runs-on: windows-latest
+    runs-on: windows-2022

     steps:
       - name: Clone
.github/workflows/release.yml (vendored, 26 changes)

@@ -49,7 +49,8 @@ jobs:
       run: |
         sysctl -a
         cmake -B build \
-          -DCMAKE_BUILD_RPATH="@loader_path" \
+          -DCMAKE_INSTALL_RPATH='@loader_path' \
+          -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
           -DLLAMA_FATAL_WARNINGS=ON \
           -DGGML_METAL_USE_BF16=ON \
           -DGGML_METAL_EMBED_LIBRARY=ON \

@@ -103,7 +104,8 @@ jobs:
         # Metal is disabled due to intermittent failures with Github runners not having a GPU:
         # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
         cmake -B build \
-          -DCMAKE_BUILD_RPATH="@loader_path" \
+          -DCMAKE_INSTALL_RPATH='@loader_path' \
+          -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
           -DLLAMA_FATAL_WARNINGS=ON \
           -DGGML_METAL=OFF \
           -DGGML_RPC=ON

@@ -160,6 +162,8 @@ jobs:
       id: cmake_build
       run: |
         cmake -B build \
+          -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+          -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
          -DGGML_BACKEND_DL=ON \
          -DGGML_NATIVE=OFF \
          -DGGML_CPU_ALL_VARIANTS=ON \

@@ -211,6 +215,8 @@ jobs:
       id: cmake_build
       run: |
         cmake -B build \
+          -DCMAKE_INSTALL_RPATH='$ORIGIN' \
+          -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
          -DGGML_BACKEND_DL=ON \
          -DGGML_NATIVE=OFF \
          -DGGML_CPU_ALL_VARIANTS=ON \

@@ -235,7 +241,7 @@ jobs:
         name: llama-bin-ubuntu-vulkan-x64.zip

   windows-cpu:
-    runs-on: windows-latest
+    runs-on: windows-2025

     strategy:
       matrix:

@@ -271,7 +277,7 @@ jobs:
       env:
         CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
       run: |
-        call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }}
+        call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }}
         cmake -S . -B build -G "Ninja Multi-Config" ^
          -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^
          -DGGML_NATIVE=OFF ^

@@ -288,7 +294,7 @@ jobs:
         CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
       run: |
         Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
-        Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
+        Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
         7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

       - name: Upload artifacts

@@ -298,11 +304,11 @@ jobs:
         name: llama-bin-win-cpu-${{ matrix.arch }}.zip

   windows:
-    runs-on: windows-latest
+    runs-on: windows-2025

     env:
       OPENBLAS_VERSION: 0.3.23
-      VULKAN_VERSION: 1.4.309.0
+      VULKAN_VERSION: 1.4.313.2

     strategy:
       matrix:

@@ -332,7 +338,7 @@ jobs:
         id: get_vulkan
         if: ${{ matrix.backend == 'vulkan' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

@@ -448,7 +454,7 @@ jobs:
         name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

   windows-sycl:
-    runs-on: windows-latest
+    runs-on: windows-2022

     defaults:
       run:

@@ -520,7 +526,7 @@ jobs:
         name: llama-bin-win-sycl-x64.zip

   windows-hip:
-    runs-on: windows-latest
+    runs-on: windows-2022

     strategy:
       matrix:
.gitmodules (vendored, 3 changes)

@@ -1,3 +0,0 @@
-[submodule "kompute"]
-	path = ggml/src/ggml-kompute/kompute
-	url = https://github.com/nomic-ai/kompute.git
@@ -95,7 +95,7 @@ endif()
 if (NOT DEFINED LLAMA_BUILD_COMMIT)
     set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 endif()
-set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})

 # override ggml options
 set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})

@@ -120,7 +120,6 @@ endfunction()

 llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
 llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # Options
 IOS_MIN_OS_VERSION=16.4
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # sample usage:
 #

@@ -39,7 +39,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"

@@ -779,7 +779,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"

     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

     # sample output
     # rerank score 0: 0.029
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-    find_library(CURL_LIBRARY curl REQUIRED)
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()

 if (LLAMA_LLGUIDANCE)

@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)

     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.20 (+ fix to build on GCC 15):
-        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }

-    if (params.reranking && params.embedding) {
-        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
-    }
-
     if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
         throw std::runtime_error(string_format(
             "error: the supplied chat template is not supported: %s%s\n",

@@ -2710,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
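Note: with the new `--cls-separator` flag, rerank/classification sequences can be delimited explicitly instead of relying on model-specific SEP tokens; a hedged usage sketch (model path and prompt are illustrative only):

```bash
# rank one query against a candidate document; the separator defaults to a
# tab and can be overridden, e.g. with the "<#seq#>" marker from the help text
llama-embedding -m model.gguf --pooling rank --embd-normalize -1 \
    --cls-separator '<#seq#>' \
    -p 'what is panda?<#seq#>it is a bear'
```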
@@ -2731,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),

@@ -2747,9 +2757,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
         {"--reranking", "--rerank"},
-        string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
+        string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
-            params.reranking = true;
+            params.embedding = true;
+            params.pooling_type = LLAMA_POOLING_TYPE_RANK;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
     add_opt(common_arg(
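Note: after this change `--rerank` implies embeddings mode with RANK pooling, so the flag is self-contained; a sketch of the assumed server invocation (model path hypothetical):

```bash
# expose the reranking endpoint; --embedding no longer needs to be passed
llama-server -m reranker.gguf --rerank --port 8080
```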
@@ -2790,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"--chat-template-kwargs"}, "STRING",
+        string_format("sets additional params for the json template parser"),
+        [](common_params & params, const std::string & value) {
+            auto parsed = json::parse(value);
+            for (const auto & item : parsed.items()) {
+                params.default_template_kwargs[item.key()] = item.value().dump();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
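Note: the value of `--chat-template-kwargs` is parsed as a JSON object and each entry is forwarded into the Jinja template context; a hedged example (whether a template honors a given key, such as `enable_thinking`, depends on the template itself):

```bash
# pass extra variables to the chat template when running with --jinja
llama-server -m model.gguf --jinja \
    --chat-template-kwargs '{"enable_thinking": false}'
```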
@@ -3213,6 +3234,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));

     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",
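Note: the new draft-model flags mirror the existing `-ctk`/`-ctv` options; a sketch combining them with speculative decoding (model paths hypothetical; allowed TYPE values come from `get_all_kv_cache_types()`):

```bash
# quantize the draft model's KV cache independently of the target model's
llama-server -m target.gguf -md draft.gguf -ctkd q8_0 -ctvd q8_0
```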
@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>

+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);

@@ -140,6 +142,7 @@ struct templates_params {
     bool add_generation_prompt = true;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    json extra_context;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {

@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const

 static std::string apply(
     const common_chat_template & tmpl,
-    const nlohmann::ordered_json & messages,
-    const nlohmann::ordered_json & tools,
-    bool add_generation_prompt,
-    const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+    const struct templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt)
 {
     minja::chat_template_inputs tmpl_inputs;
-    tmpl_inputs.messages = messages;
-    tmpl_inputs.tools = tools;
-    tmpl_inputs.add_generation_prompt = add_generation_prompt;
-    tmpl_inputs.extra_context = extra_context;
+    tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+    if (tools_override) {
+        tmpl_inputs.tools = *tools_override;
+    } else {
+        tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+    }
+    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+    tmpl_inputs.extra_context = inputs.extra_context;
+    if (additional_context) {
+        tmpl_inputs.extra_context.merge_patch(*additional_context);
+    }
     // TODO: add flag to control date/time, if only for testing purposes.
     //       tmpl_inputs.now = std::chrono::system_clock::now();

@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");

-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }

@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }

@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             adjusted_messages.push_back(msg);
         }
     }
-    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
         if (!inputs.enable_thinking) {

@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     } else {
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},

@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w

 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    auto prompt = apply(tmpl, inputs);

     // Hacks to fix the official (broken) prompt.
     // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,

@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });

@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
     // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
     // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;

@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }

-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     // TODO: if (has_raw_python)
     return data;
 }

@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;

-    json additional_context = {
+    json extra_context = json {
         {"enable_thinking", inputs.enable_thinking},
     };
+    extra_context.update(inputs.extra_context);

-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
+        if (!extra_context["enable_thinking"]) {
             data.prompt += "</think>";
         } else {
             data.thinking_forced_open = true;

@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {

 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {

@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+
+    params.extra_context = json::object();
+    for (auto el : inputs.chat_template_kwargs) {
+        params.extra_context[el.first] = json::parse(el.second);
+    }
+
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }

@@ -1838,7 +1855,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     if (res < 0) {
         // if the custom "tmpl" is not supported, we throw an error
         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-        throw std::runtime_error("this custom template is not supported");
+        throw std::runtime_error("this custom template is not supported, try using --jinja");
     }

     // if it turns out that our buffer is too small, we resize it
@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>

 struct common_chat_templates;

@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string, std::string> chat_template_kwargs;
 };

 struct common_chat_params {
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
         // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif

         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;

 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif

         filename_utf32 = converter.from_bytes(filename);

@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }

+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32

@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
         // process path from front to back, procedurally creating directories
         while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
             const std::wstring subpath = wpath.substr(0, pos_slash);
-            const wchar_t * test = subpath.c_str();

-            const bool success = CreateDirectoryW(test, NULL);
+            pos_slash += 1;
+
+            // skip the drive letter, in some systems it can return an access denied error
+            if (subpath.length() == 2 && subpath[1] == ':') {
+                continue;
+            }
+
+            const bool success = CreateDirectoryW(subpath.c_str(), NULL);

             if (!success) {
                 const DWORD error = GetLastError();

@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }

     return true;

@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {

     const llama_vocab * vocab = llama_model_get_vocab(model);

-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
-        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
-            ok = false;
-        } else if (!has_eos) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);

     llama_context * lctx = llama_init_from_model(model, cparams);

@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }

+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;

@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;

-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

@@ -1280,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
+#include <sstream>

 #ifdef _WIN32

@@ -199,6 +200,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)

+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;

@@ -355,7 +359,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    std::string cls_sep = "\t"; // separator of classification sequences

     // server params
     int32_t port = 8080; // server listens on this network port

@@ -366,6 +370,7 @@ struct common_params {

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;

@@ -378,6 +383,8 @@ struct common_params {
     std::string ssl_file_key = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT

+    std::map<std::string, std::string> default_template_kwargs;
+
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
     bool endpoint_slots = false;
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }

-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-  public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();

@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
             size_t i = 0;
             while (i < from.length() && i < to.length() && from[i] == to[i]) {
                 i++;
             }
             if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
             }
             if (i < from.length() && i < to.length()) {
                 if (i > 0) {
File diff suppressed because it is too large.
@@ -128,6 +128,7 @@ models = [
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions

@@ -137,6 +138,12 @@ pre_computed_hashes = [
     {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
 ]
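Note: after extending these lists, the tokenizer hashes are regenerated by rerunning the update script; its usual invocation takes a Hugging Face token for downloading the tokenizer files (token value hypothetical):

```bash
python3 convert_hf_to_gguf_update.py hf_xxxxxxxxxxxxxxxxxxxxxxxx
```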
@@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name | Value | Function |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
-| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
+| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
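Note: these are runtime environment variables, so no rebuild is needed to toggle them; a hedged example (binary and model paths illustrative):

```bash
# disable the Intel-GPU-specific optimizations for a device older than Gen 10
GGML_SYCL_DISABLE_OPT=1 ./build/bin/llama-cli -m model.gguf -p "hello"
```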
246
docs/build-s390x.md
Normal file
246
docs/build-s390x.md
Normal file
@@ -0,0 +1,246 @@
|
||||
> [!IMPORTANT]
|
||||
> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures: [build.md](build.md).
|
||||
|
||||
# Build llama.cpp locally (for s390x)
|
||||
|
||||
The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
|
||||
|
||||
The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
|
||||
|
||||
**To get the code:**
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
## CPU Build with BLAS
|
||||
|
||||
Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
**Notes**:
|
||||
|
||||
- For faster repeated compilation, install [ccache](https://ccache.dev/)
|
||||
- By default, VXE/VXE2 is enabled. To disable it (not recommended):
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_VXE=OFF
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- By default, NNPA is enabled when available. To disable it (not recommended):
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DGGML_NNPA=OFF
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
- For debug builds:
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Debug \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build build --config Debug -j $(nproc)
|
||||
```
|
||||
|
||||
- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
|
||||
|
||||
```bash
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_BLAS=ON \
|
||||
-DGGML_BLAS_VENDOR=OpenBLAS \
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
```
|
||||
|
||||
## Getting GGUF Models
|
||||
|
||||
All models need to be converted to Big-Endian. You can achieve this in three cases:
|
||||
|
||||
1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
|
||||
|
||||

|
||||
|
||||
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
|
||||
|
||||
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
|
||||
|
||||
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
|
||||
|
||||

|
||||
|
||||
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
|
||||
|
||||
```bash
|
||||
python3 convert_hf_to_gguf.py \
|
||||
--outfile model-name-be.f16.gguf \
|
||||
--outtype f16 \
|
||||
--bigendian \
|
||||
model-directory/
|
||||
```
|
||||
|
||||
For example,
|
||||
|
||||
```bash
|
||||
python3 convert_hf_to_gguf.py \
|
||||
--outfile granite-3.3-2b-instruct-be.f16.gguf \
|
||||
--outtype f16 \
|
||||
--bigendian \
|
||||
granite-3.3-2b-instruct/
|
||||
```

3. **Convert existing GGUF Little-Endian model to Big-Endian**

    The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.

    ```bash
    python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
    ```

    For example,

    ```bash
    python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
    mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
    ```
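
    To sanity-check the result, you can inspect the first 8 bytes of the file: the 4-byte `GGUF` magic is followed by a 4-byte version field, so on a correctly converted Big-Endian file version 3 should read `00 00 00 03` (a sketch, assuming `xxd` is available):

    ```bash
    # bytes 0-3: 4747 5546 = "GGUF" magic; bytes 4-7: the version field
    xxd -l 8 granite-3.3-2b-instruct-be.f16.gguf
    ```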

**Notes:**

- The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.

## IBM Accelerators

### 1. SIMD Acceleration

Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. On such systems, the APIs can still run but will use a scalar implementation.

### 2. NNPA Vector Intrinsics Acceleration

Only available on IBM z16 or later systems with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. On such systems, the APIs can still run but will use a scalar implementation.
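
To check which of these facilities the host CPU actually reports, one option (a sketch, assuming a Linux guest where the s390x facility flags are exposed in `/proc/cpuinfo`) is:

```bash
# look for vxe/vxe2 (SIMD) and nnpa among the reported features
grep -m1 '^features' /proc/cpuinfo | grep -oE 'vxe2?|nnpa' | sort -u
```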

### 3. zDNN Accelerator

_Only available on IBM z16 or later systems. No direction at the moment._

### 4. Spyre Accelerator

_No direction at the moment._

## Performance Tuning

### 1. Virtualization Setup

It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.

Note: Type-2 virtualization is not supported at the moment; while you can get it running, the performance will not be the best.

### 2. IFL (Core) Count

It is recommended to allocate a minimum of 8 shared IFLs to the LPAR. Increasing the IFL count past 8 shared IFLs will only improve Prompt Processing performance but not Token Generation.

Note: IFL count does not equate to vCPU count.

### 3. SMT vs NOSMT (Simultaneous Multithreading)

It is strongly recommended to disable SMT via the kernel boot parameters as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
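
As a sketch (assuming a reasonably recent kernel; exact steps vary by distribution), the current SMT state can be read from sysfs, and `nosmt` is the usual kernel boot parameter for disabling it:

```bash
# prints "on", "off", "forceoff" or "notsupported"
cat /sys/devices/system/cpu/smt/control

# example kernel command line entry (distribution-specific):
# GRUB_CMDLINE_LINUX="... nosmt"
```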

### 4. BLAS vs NOBLAS

IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.

## Frequently Asked Questions (FAQ)

1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`

    Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, i.e., `granite-3.3-2b-instruct-be.F16.gguf`.

    You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big-Endian.

2. I'm getting extremely poor performance when running inference on a model.

    Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
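
    If the quantization you are using is not accelerated, requantizing to a supported type from the matrix (for example Q4_K) may help. A sketch, assuming the `llama-quantize` and `llama-bench` tools built above and a hypothetical model file name:

    ```bash
    ./build/bin/llama-quantize granite-3.3-2b-instruct-be.f16.gguf \
        granite-3.3-2b-instruct-be.q4_k_m.gguf Q4_K_M

    # measure prompt processing and token generation speed
    ./build/bin/llama-bench -m granite-3.3-2b-instruct-be.q4_k_m.gguf
    ```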

3. I'm building on IBM z17 and getting the following error message: `invalid switch -march=z17`

    Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.

## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**

    Please file an issue in llama.cpp and ensure that the title contains "s390x".

2. **Other Questions**

    Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).

## Appendix A: Hardware Support Matrix

|         | Support | Minimum Compiler Version |
| ------- | ------- | ------------------------ |
| IBM z15 | ✅      |                          |
| IBM z16 | ✅      |                          |
| IBM z17 | ✅      | GCC 15.1.0               |

- ✅ - supported and verified to run as intended
- 🚫 - unsupported, we are unlikely able to provide support

## Appendix B: SIMD Support Matrix

|            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32       | ✅          | ✅   | ❓   | ❓    |
| FP16       | ✅          | ✅   | ❓   | ❓    |
| BF16       | 🚫          | 🚫   | ❓   | ❓    |
| Q4_0       | ✅          | ✅   | ❓   | ❓    |
| Q4_1       | ✅          | ✅   | ❓   | ❓    |
| Q5_0       | 🚫          | 🚫   | ❓   | ❓    |
| Q5_1       | 🚫          | 🚫   | ❓   | ❓    |
| Q8_0       | ✅          | ✅   | ❓   | ❓    |
| Q2_K       | 🚫          | 🚫   | ❓   | ❓    |
| Q3_K       | ✅          | ✅   | ❓   | ❓    |
| Q4_K       | ✅          | ✅   | ❓   | ❓    |
| Q5_K       | ✅          | ✅   | ❓   | ❓    |
| Q6_K       | ✅          | ✅   | ❓   | ❓    |
| TQ1_0      | 🚫          | 🚫   | ❓   | ❓    |
| TQ2_0      | 🚫          | 🚫   | ❓   | ❓    |
| IQ2_XXS    | 🚫          | 🚫   | ❓   | ❓    |
| IQ2_XS     | 🚫          | 🚫   | ❓   | ❓    |
| IQ2_S      | 🚫          | 🚫   | ❓   | ❓    |
| IQ3_XXS    | 🚫          | 🚫   | ❓   | ❓    |
| IQ3_S      | 🚫          | 🚫   | ❓   | ❓    |
| IQ1_S      | 🚫          | 🚫   | ❓   | ❓    |
| IQ1_M      | 🚫          | 🚫   | ❓   | ❓    |
| IQ4_NL     | ✅          | ✅   | ❓   | ❓    |
| IQ4_XS     | ✅          | ✅   | ❓   | ❓    |
| FP32->FP16 | 🚫          | ✅   | ❓   | ❓    |
| FP16->FP32 | 🚫          | ✅   | ❓   | ❓    |

- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

@@ -1,6 +1,6 @@
 # Build llama.cpp locally
 
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
 
 The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.

@@ -557,6 +557,10 @@ ninja
 
 To read documentation for how to build on Android, [click here](./android.md)
 
+## IBM Z & LinuxONE
+
+To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
+
 ## Notes about GPU-accelerated backends
 
 The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.

@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
 
 ### 2. Define the model architecture in `llama.cpp`
 
-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
 
 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
 
 ### 3. Build the GGML graph implementation
 
-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
 
 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.

@@ -25,6 +25,9 @@ Additionally, there the following images, similar to the above:
 
 - `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
 
 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -e
 
 AI_NAME="${AI_NAME:-Miku}"

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 set -e

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 set -euo pipefail

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 set -e

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 #
 # Temporary script - will be removed in the future

@@ -133,10 +133,36 @@ int main(int argc, char ** argv) {
     // max batch size
     const uint64_t n_batch = params.n_batch;
 
+    // get added sep and eos token, if any
+    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
+    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<llama_token> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }

@@ -55,6 +55,8 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
         v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
     } else if (type == GGML_TYPE_F32) {
         v = *(float *) &data[i];
+    } else if (type == GGML_TYPE_I64) {
+        v = (float) *(int64_t *) &data[i];
     } else if (type == GGML_TYPE_I32) {
         v = (float) *(int32_t *) &data[i];
     } else if (type == GGML_TYPE_I16) {
@@ -134,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {
 
     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
+    if (tokens.empty()) {
+        LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
+        return false;
+    }
+
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;

@@ -41,12 +41,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         // add input to batch (this increments n_tokens)
         for (int32_t j = 0; j < n_toks; j++) {
-            common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+            common_batch_add(batch, inputs[j], j, { 0 }, true);
         }
 
         // clear previous kv_cache values (irrelevant for embeddings)
         llama_memory_clear(llama_get_memory(ctx), true);
-        llama_set_embeddings(ctx, true);
         llama_set_causal_attn(ctx, false);
 
         // run model
@@ -103,7 +102,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
     llama_token eos_token = llama_vocab_eos(vocab);
 
     llama_memory_clear(llama_get_memory(ctx), true);
-    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
@@ -166,6 +164,8 @@ int main(int argc, char * argv[]) {
     llama_model_params mparams = common_model_params_to_llama(params);
     llama_context_params cparams = common_context_params_to_llama(params);
 
+    cparams.embeddings = true;
+
     llama_backend_init();
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -213,6 +213,8 @@ int main(int argc, char * argv[]) {
         std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
     }
 
+    llama_set_embeddings(ctx, false);
+
     // ### Generation ###
     // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
     {

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -e
 
 MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 cd `dirname $0`
 cd ..

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 set -e

@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == 0;
+        const bool is_first = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) == -1;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,15 +113,16 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
             int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0);
+            int n_ctx_used = llama_memory_seq_pos_max(llama_get_memory(ctx), 0) + 1;
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
                 exit(0);
             }
 
-            if (llama_decode(ctx, batch)) {
-                GGML_ABORT("failed to decode\n");
+            int ret = llama_decode(ctx, batch);
+            if (ret != 0) {
+                GGML_ABORT("failed to decode, ret = %d\n", ret);
             }
 
             // sample the next token

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 # MIT license
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # MIT license
 # Copyright (C) 2024 Intel Corporation

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # MIT license
 # Copyright (C) 2025 Intel Corporation

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
 # python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json

@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
+option(GGML_NNPA "ggml: enable nnpa" ON)
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -172,6 +173,7 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -179,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
-option(GGML_KOMPUTE "ggml: use Kompute" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -264,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cpp.h
     include/ggml-cuda.h
-    include/ggml-kompute.h
     include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
@@ -358,6 +358,13 @@ write_basic_package_version_file(
     VERSION ${GGML_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)
 
+target_compile_definitions(ggml-base PRIVATE
+    GGML_VERSION="${GGML_INSTALL_VERSION}"
+    GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
     ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
     DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
@@ -367,6 +374,8 @@ if (MSVC)
     /wd4005 # Macro redefinition
     /wd4244 # Conversion from one type to another type, possible loss of data
     /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+    /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
+    /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
     /wd4996 # Disable POSIX deprecation warnings
     /wd4702 # Unreachable code warnings
     )
@@ -386,4 +395,46 @@ if (MSVC)
     disable_msvc_warnings(ggml-cpu-skylakex)
     disable_msvc_warnings(ggml-cpu-icelake)
     disable_msvc_warnings(ggml-cpu-alderlake)
+
+    if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()

@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
             (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
              CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)

@@ -339,7 +339,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
 
     // Tensor initialization
     GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
 
@@ -133,6 +134,7 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

@@ -1,50 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GGML_KOMPUTE_MAX_DEVICES 16
-
-struct ggml_vk_device {
-    int index;
-    int type; // same as VkPhysicalDeviceType
-    size_t heapSize;
-    const char * name;
-    const char * vendor;
-    int subgroupSize;
-    uint64_t bufferAlignment;
-    uint64_t maxAlloc;
-};
-
-struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
-bool ggml_vk_has_vulkan(void);
-bool ggml_vk_has_device(void);
-struct ggml_vk_device ggml_vk_current_device(void);
-
-//
-// backend API
-//
-
-// forward declaration
-typedef struct ggml_backend * ggml_backend_t;
-
-GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
-
-GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
-
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
-
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
-
-#ifdef __cplusplus
-}
-#endif

@@ -314,6 +314,13 @@
 extern "C" {
 #endif
 
+    // Function type used in fatal error callbacks
+    typedef void (*ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
+
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
 
@@ -470,6 +477,7 @@ extern "C" {
         GGML_OP_TRANSPOSE,
         GGML_OP_GET_ROWS,
         GGML_OP_GET_ROWS_BACK,
+        GGML_OP_SET_ROWS,
         GGML_OP_DIAG,
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_DIAG_MASK_ZERO,
@@ -481,14 +489,16 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
+        GGML_OP_ROLL,
         GGML_OP_ARANGE,
         GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
@@ -518,6 +528,8 @@ extern "C" {
         GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         GGML_OP_OPT_STEP_ADAMW,
 
+        GGML_OP_GLU,
+
         GGML_OP_COUNT,
     };
 
@@ -541,6 +553,16 @@ extern "C" {
         GGML_UNARY_OP_COUNT,
     };
 
+    enum ggml_glu_op {
+        GGML_GLU_OP_REGLU,
+        GGML_GLU_OP_GEGLU,
+        GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU_ERF,
+        GGML_GLU_OP_GEGLU_QUICK,
+
+        GGML_GLU_OP_COUNT,
+    };
+
     enum ggml_object_type {
         GGML_OBJECT_TYPE_TENSOR,
         GGML_OBJECT_TYPE_GRAPH,
@@ -626,6 +648,9 @@ extern "C" {
 
     // misc
 
+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
@@ -656,6 +681,7 @@ extern "C" {
     GGML_API const char * ggml_op_symbol(enum ggml_op op);
 
     GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op);
     GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@@ -686,6 +712,9 @@ extern "C" {
     // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
     GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
 
+    // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements
+    GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor);
+
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -757,6 +786,7 @@ extern "C" {
     GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
 
     GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor);
 
     GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
@@ -1085,6 +1115,89 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // gated linear unit ops
+    // A: n columns, r rows,
+    // result is n / 2 columns, r rows,
+    // expects gate in second half of row, unless swapped is true
+    GGML_API struct ggml_tensor * ggml_glu(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            enum ggml_glu_op op,
+            bool swapped);
+
+    GGML_API struct ggml_tensor * ggml_reglu(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_reglu_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_swiglu(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_swiglu_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    // A: n columns, r rows,
+    // B: n columns, r rows,
+    GGML_API struct ggml_tensor * ggml_glu_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            enum ggml_glu_op op);
+
+    GGML_API struct ggml_tensor * ggml_reglu_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_swiglu_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
@@ -1184,6 +1297,19 @@ extern "C" {
             struct ggml_tensor * a,
             float s);
 
+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float s,
+            float b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float s,
+            float b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
@@ -1374,6 +1500,23 @@ extern "C" {
             struct ggml_tensor * b, // row indices
             struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
 
+    // a TD [n_embd, ne1, ne2, ne3]
+    // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3
+    // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1)
+    //
+    // undefined behavior if destination rows overlap
+    //
+    // broadcast:
+    // ne2 % ne11 == 0
+    // ne3 % ne12 == 0
+    //
+    // return view(a)
+    GGML_API struct ggml_tensor * ggml_set_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a, // destination
+            struct ggml_tensor * b, // source
+            struct ggml_tensor * c); // row indices
+
     GGML_API struct ggml_tensor * ggml_diag(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
@@ -1411,8 +1554,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a [ne0, ne01, ne02, ne03]
+    // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    // ne02 % ne12 == 0
+    // ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
     // mask is optional
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
@@ -1722,6 +1871,17 @@ extern "C" {
             struct ggml_tensor * b,
             int stride);
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor * b, // input data [W, H, C, N]
+            int s0, // stride dimension 0
+            int s1, // stride dimension 1
+            int p0, // padding dimension 0
+            int p1, // padding dimension 1
+            int d0, // dilation dimension 0
+            int d1); // dilation dimension 1
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
@@ -1764,6 +1924,12 @@ extern "C" {
     enum ggml_scale_mode {
         GGML_SCALE_MODE_NEAREST = 0,
         GGML_SCALE_MODE_BILINEAR = 1,
+
+        GGML_SCALE_MODE_COUNT
     };
 
+    enum ggml_scale_flag {
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
+    };
+
     // interpolate
@@ -1776,14 +1942,26 @@ extern "C" {
 
     // interpolate
     // interpolate scale to specified dimensions
-    GGML_API struct ggml_tensor * ggml_upscale_ext(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int ne0,
             int ne1,
             int ne2,
             int ne3,
-            enum ggml_scale_mode mode);
+            enum ggml_scale_mode mode),
+        "use ggml_interpolate instead");
+
+    // Up- or downsamples the input to the specified size.
+    // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
+    GGML_API struct ggml_tensor * ggml_interpolate(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
 
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
@@ -1801,6 +1979,17 @@ extern "C" {
             int p0,
             int p1);
 
+    // Move tensor elements by an offset given for each dimension. Elements that
+    // are shifted beyond the last position are wrapped around to the beginning.
+    GGML_API struct ggml_tensor * ggml_roll(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int shift0,
+            int shift1,
+            int shift2,
+            int shift3);
+
+
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
@@ -1835,11 +2024,17 @@ extern "C" {
 
 #define GGML_KQ_MASK_PAD 64
 
-    // q: [n_embd_k, n_batch, n_head, 1]
-    // k: [n_embd_k, n_kv, n_head_kv, 1]
-    // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
+    // q: [n_embd_k, n_batch, n_head, ne3 ]
+    // k: [n_embd_k, n_kv, n_head_kv, ne3 ]
+    // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
+    //
+    // broadcast:
+    // n_head % n_head_kv == 0
+    // n_head % ne32 == 0
+    // ne3 % ne33 == 0
+    //
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1878,7 +2073,8 @@ extern "C" {
             struct ggml_tensor * dt,
             struct ggml_tensor * A,
             struct ggml_tensor * B,
-            struct ggml_tensor * C);
+            struct ggml_tensor * C,
+            struct ggml_tensor * ids);
 
     // partition into non-overlapping windows with padding if needed
     // example:

@@ -286,6 +286,10 @@ function(ggml_add_cpu_backend_variant tag_name)
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
     endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
@@ -311,18 +315,45 @@ if (GGML_CPU_ALL_VARIANTS)
         # MSVC doesn't support AMX
         ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
-elseif(GGML_SYSTEM_ARCH STREQUAL "ARM" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
-    # Many of these features are optional so we build versions with popular
-    # combinations and name the backends based on the version they were
-    # first released with
-    ggml_add_cpu_backend_variant(armv8.0_1)
-    ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
-    ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
-    ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
-    ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
-    ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
-    ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
-    ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
+    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+        # Many of these features are optional so we build versions with popular
+        # combinations and name the backends based on the version they were
+        # first released with
+        ggml_add_cpu_backend_variant(armv8.0_1)
+        ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
+        ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+        ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+        ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+        ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+        ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+        ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+    elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
+        # Android-specific backends with SoC-compatible feature sets
+        ggml_add_cpu_backend_variant(android_armv8.0_1)
+        ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
+        ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+        ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+    elseif (APPLE)
+        ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
+        ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
+        ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
+    else()
+        message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
+    endif()
+elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
+    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+        ggml_add_cpu_backend_variant(power0)
+        ggml_add_cpu_backend_variant(power7_1 POWER7)
+        ggml_add_cpu_backend_variant(power7_2 POWER7 VSX)
+        ggml_add_cpu_backend_variant(power8_1 POWER8)
+        ggml_add_cpu_backend_variant(power8_2 POWER8 VSX)
+        ggml_add_cpu_backend_variant(power9 POWER9 VSX)
+        ggml_add_cpu_backend_variant(power10 POWER10 VSX)
+        ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+    else()
+        message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
+    endif()
 else()
     message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
 endif()
@@ -334,7 +365,6 @@ ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
-ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(MUSA)
 ggml_add_backend(RPC)

@@ -61,14 +61,13 @@
 #include "ggml-cann.h"
 #endif
 
-#ifdef GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
-
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +90,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32
@@ -184,9 +185,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
-#ifdef GGML_USE_KOMPUTE
-        register_backend(ggml_backend_kompute_reg());
-#endif
 #ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
 #endif
@@ -570,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("cann", silent, dir_path);
     ggml_backend_load_best("cuda", silent, dir_path);
     ggml_backend_load_best("hip", silent, dir_path);
-    ggml_backend_load_best("kompute", silent, dir_path);
     ggml_backend_load_best("metal", silent, dir_path);
     ggml_backend_load_best("rpc", silent, dir_path);
     ggml_backend_load_best("sycl", silent, dir_path);

@@ -817,8 +817,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
     if (sched->debug > 1) {
         ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+        GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
+            fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
+            graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1826,7 +1827,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
     ggml_free(copy.ctx_unallocated);
 }
 
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) {
     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
     if (copy.buffer == NULL) {
         return false;
@@ -1837,28 +1838,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
 
     assert(g1->n_nodes == g2->n_nodes);
 
-    for (int i = 0; i < g1->n_nodes; i++) {
-        struct ggml_tensor * t1 = g1->nodes[i];
-        struct ggml_tensor * t2 = g2->nodes[i];
+    if (test_node != nullptr) {
+        // Compute the whole graph and only test the output for a specific tensor
+        ggml_backend_graph_compute(backend1, g1);
+        ggml_backend_graph_compute(backend2, g2);
 
-        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
-
-        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
-        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
-
-        ggml_backend_graph_compute(backend1, &g1v);
-        ggml_backend_graph_compute(backend2, &g2v);
-
-        if (ggml_is_view_op(t1->op)) {
-            continue;
+        int test_node_idx = -1;
+        for (int i = 0; i < g1->n_nodes; i++) {
+            struct ggml_tensor * t1 = g1->nodes[i];
+            if (t1 == test_node) {
+                test_node_idx = i;
+                break;
+            }
         }
+        GGML_ASSERT(test_node_idx != -1);
 
-        // compare results, calculate rms etc
-        if (!callback(i, t1, t2, user_data)) {
-            break;
+        callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data);
+    } else {
+        for (int i = 0; i < g1->n_nodes; i++) {
+            struct ggml_tensor * t1 = g1->nodes[i];
+            struct ggml_tensor * t2 = g2->nodes[i];
+
+            assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
+
+            struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
+            struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
+
+            ggml_backend_graph_compute(backend1, &g1v);
+            ggml_backend_graph_compute(backend2, &g2v);
+
+            if (ggml_is_view_op(t1->op)) {
+                continue;
+            }
+
+            // compare results, calculate rms etc
+            if (!callback(i, t1, t2, user_data)) {
+                break;
+            }
         }
     }
 
     ggml_backend_graph_copy_free(copy);
 
     return true;

@@ -65,8 +65,9 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
-#include <aclnnop/aclnn_grouped_matmul_v2.h>
+#include <aclnnop/aclnn_grouped_matmul_v3.h>
+#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
 #include <aclnnop/aclnn_zero.h>
 #include <float.h>
 
 #include <cmath>
@@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
         nb[i] = nb[i - 1] * ne[i - 1];
     }
 
+    ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
     aclTensor* zero =
         ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
-    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
     return zero;
+
+    GGML_UNUSED(n_bytes);
 }
 
 /**

@@ -2654,6 +2656,67 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
         memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
     }
 
+#ifdef ASCEND_310P
+    ggml_tensor src0_row = *src0;
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    if (src0->type == GGML_TYPE_F16) {
+        src0_row.type = GGML_TYPE_F32;
+    }
+
+    // src0_row [D, M, 1, 1] weight without permute
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[0] = ori_src0_nb[0];
+    src0_row.nb[1] = ori_src0_nb[1];
+    src0_row.nb[2] = ori_src0_nb[1];
+    src0_row.nb[3] = ori_src0_nb[1];
+
+    // src1_row [D, 1, 1, 1] -> input
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
+
+    // dst_row [M, 1, 1, 1] -> out
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
+
+    //create weight for one row
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
+
+            src0_row.data = src0_tmp_ptr;
+            src1_row.data = src1_tmp_ptr;
+            dst_row.data = dst_tmp_ptr;
+            dst_row.src[0] = &src0_row;
+            dst_row.src[1] = &src1_row;
+
+            ggml_cann_mul_mat(ctx, &dst_row);
+        }
+    }
+    return;
+#endif
+
     std::vector<aclTensor*> src0_tensor_vec;
     std::vector<aclTensor*> src1_tensor_vec;
     std::vector<aclTensor*> dst_tensor_vec;
@@ -2701,9 +2764,9 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
     }
 
     size_t GROUP_SIZE = 128;
-    // GroupedMatmulV2 required tensor_list.size < 128
+    // GroupedMatmulV3 required tensor_list.size < 128
     for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
-        // split and call GroupedMatmulV2
+        // split and call GroupedMatmulV3
         size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
         std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
         std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
@@ -2713,7 +2776,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
     aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
     aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
 
-    GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+    GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
         nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
 
     ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);

@@ -359,7 +359,7 @@ struct ggml_backend_cann_context {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
 
-        bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
+        async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
     }

@@ -2086,6 +2086,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                 return false;
             }
         } break;
+        case GGML_OP_SET_ROWS:
+            {
+                // TODO: add support
+                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
+                return false;
+            } break;
         case GGML_OP_CPY: {
             ggml_tensor *src = op->src[0];
             if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
@@ -2182,12 +2188,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
         case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
@@ -2205,6 +2209,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_COUNT_EQUAL:
            return true;
+        case GGML_OP_SCALE:
+            float bias;
+            memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
+            return bias == 0.0f; // TODO: support bias != 0.0f
+        case GGML_OP_SOFT_MAX:
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
+            return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
        case GGML_OP_FLASH_ATTN_EXT:{
            // derived from [ggml-cuda.cu]
            if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2227,6 +2239,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                // DeepSeek MLA
                return false;
            }
+            // TODO: support broadcast
+            // ref: https://github.com/ggml-org/llama.cpp/pull/14435
            if (op->src[0]->ne[3] != 1) {
                return false;
            }

@@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
     # build, using set_source_files_properties() to set the arch flags is not possible
     set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
     add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
-    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
     set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -158,48 +158,48 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_CPU_ARM_ARCH)
|
||||
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
# Begin with the lowest baseline
|
||||
set(ARM_MCPU "armv8-a")
|
||||
set(ARCH_TAGS "")
|
||||
set(ARCH_DEFINITIONS "")
|
||||
# Begin with the lowest baseline
|
||||
set(ARM_MCPU "armv8-a")
|
||||
set(ARCH_TAGS "")
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
if (GGML_INTERNAL_DOTPROD)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
||||
endif()
|
||||
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
||||
endif()
|
||||
if (GGML_INTERNAL_MATMUL_INT8)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE2)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SME)
|
||||
set(ARM_MCPU "armv9.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
||||
endif()
|
||||
|
||||
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
if (GGML_INTERNAL_DOTPROD)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
|
||||
endif()
|
||||
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE)
|
||||
set(ARM_MCPU "armv8.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
|
||||
endif()
|
||||
if (GGML_INTERNAL_MATMUL_INT8)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
|
||||
endif()
|
||||
if (GGML_INTERNAL_SVE2)
|
||||
set(ARM_MCPU "armv8.6-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
|
||||
endif()
|
||||
if (GGML_INTERNAL_NOSVE)
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+nosve")
|
||||
endif()
|
||||
if (GGML_INTERNAL_SME)
|
||||
set(ARM_MCPU "armv9.2-a")
|
||||
set(ARCH_TAGS "${ARCH_TAGS}+sme")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -388,6 +388,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
||||
endif()
|
||||
elseif(GGML_CPU_ALL_VARIANTS)
|
||||
# Begin with the lowest baseline
|
||||
set(ARCH_DEFINITIONS "")
|
||||
|
||||
# When a feature is selected, bump the MCPU to the first
|
||||
# version that supported it
|
||||
foreach(PVER RANGE 7 11)
|
||||
if(DEFINED GGML_INTERNAL_POWER${PVER})
|
||||
set(POWERPC_MCPU "power${PVER}")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
|
||||
endif()
|
||||
endforeach()
|
||||
if (GGML_INTERNAL_VSX)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
|
||||
list(APPEND ARCH_FLAGS -mvsx)
|
||||
endif()
|
||||
|
||||
if (DEFINED POWERPC_MCPU)
|
||||
list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
|
||||
endif()
|
||||
ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
|
||||
else()
|
||||
if (GGML_CPU_POWERPC_CPUTYPE)
|
||||
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
|
||||
@@ -427,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# TODO: Separation to determine activation of VX/VXE/VXE2
|
||||
if (${S390X_M} MATCHES "8561|8562")
|
||||
set(GGML_NNPA OFF)
|
||||
message(STATUS "z15 target")
|
||||
list(APPEND ARCH_FLAGS -march=z15)
|
||||
elseif (${S390X_M} MATCHES "3931")
|
||||
@@ -443,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
|
||||
if (GGML_VXE)
|
||||
message(STATUS "VX/VXE/VXE2 enabled")
|
||||
list(APPEND ARCH_FLAGS -mvx -mzvector)
|
||||
list(APPEND ARCH_DEFINITIONS GGML_VXE)
|
||||
endif()
|
||||
|
||||
if (GGML_NNPA)
|
||||
message(STATUS "NNPA enabled")
|
||||
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
|
||||
endif()
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
|
||||
message(STATUS "Wasm detected")
|
||||
@@ -465,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.6.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.9.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
@@ -560,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (EMSCRIPTEN)
|
||||
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||
endif()
|
||||
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
|
||||
# The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
|
||||
target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
@@ -8,6 +8,7 @@
#include "mmq.h"
#include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "quants.h"
#include "ggml-quants.h"
#include <algorithm>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_

// Quantize these floats
const float iscale = 127.f / amax;
y[i].d = GGML_FP32_TO_FP16(1 / iscale);
y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
const float id = ( amax != 0.0f ) ? iscale : 0.f;
const __m512 vscale = _mm512_set1_ps(id);

@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

__m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));

for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

__m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));

for (int m = 0; m < nr; ++m) {
const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));

__m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
va[k] = _mm512_set1_epi32(a_ptr[k]);
vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
}
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
}

// load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
for (int k = 0; k < 8; ++k) {
va[k] = _mm512_set1_epi32(a_ptr[k]);
}
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
}

// load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
va[k] = _mm512_set1_epi32(a_ptr[k]);
va[k] = _mm512_add_epi8(va[k], off);
}
vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
}

// load b
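These hunks mechanically swap GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 for CPU-backend-local GGML_CPU_* variants. For readers unfamiliar with the conversion itself, a standalone sketch of what an FP16-to-FP32 helper computes (bit-level IEEE 754 binary16 to binary32; illustration only — ggml's real helpers may use lookup tables or hardware intrinsics):

#include <stdint.h>
#include <string.h>

// Convert one IEEE 754 binary16 value (raw bits in a uint16_t) to float.
static float fp16_to_fp32_sketch(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    uint32_t exp  = (h >> 10) & 0x1F;
    uint32_t mant = h & 0x03FF;
    uint32_t bits;
    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                              // signed zero
        } else {
            exp = 127 - 15 + 1;                       // subnormal: renormalize
            while ((mant & 0x0400) == 0) { mant <<= 1; exp--; }
            mant &= 0x03FF;
            bits = sign | (exp << 23) | (mant << 13);
        }
    } else if (exp == 0x1F) {
        bits = sign | 0x7F800000 | (mant << 13);      // inf / NaN
    } else {
        bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof(f));                     // bit-cast to float
    return f;
}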
ggml/src/ggml-cpu/arch-fallback.h (new file, 184 lines)
@@ -0,0 +1,184 @@
#pragma once

// Rename `_generic` functions if no native implementation is available.
// This effectively selects the generic implementation.

#if defined(GGML_CPU_GENERIC)
// quants.c
#define quantize_row_q8_0_generic quantize_row_q8_0
#define quantize_row_q8_1_generic quantize_row_q8_1
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__loongarch64)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__riscv)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__s390x__)
// quants.c
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__wasm__)
// quants.c
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#endif
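The header above simply renames `_generic` symbols onto the public kernel names wherever a platform has no native implementation. A standalone C sketch of the same trick, with hypothetical example names rather than ggml's:

// vec_scale is the public entry point; vec_scale_generic is the portable body.
// When no native kernel exists, the macro makes the generic body provide the
// public symbol, so callers never need to know which variant was compiled in.
#include <stdio.h>

void vec_scale(float * x, int n, float s); // public declaration

#if !defined(HAVE_NATIVE_VEC_SCALE)        // hypothetical feature macro
#define vec_scale_generic vec_scale
#endif

void vec_scale_generic(float * x, int n, float s) {
    for (int i = 0; i < n; ++i) {
        x[i] *= s;
    }
}

int main(void) {
    float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    vec_scale(v, 4, 0.5f); // resolves to the generic body in this build
    printf("%.1f %.1f %.1f %.1f\n", v[0], v[1], v[2], v[3]);
    return 0;
}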
@@ -3,6 +3,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"

#include "../../quants.h"
#include "../../ggml-cpu-impl.h"
@@ -62,7 +63,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
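The hunk above computes the per-block q8_0 scale (d = amax / 127) and its inverse before the NEON store. For reference, a scalar sketch of quantizing one q8_0 block — a simplified illustration, not ggml's reference code; the block size and field names follow the diffs, and the FP16 store is abstracted behind a caller-supplied converter:

#include <math.h>
#include <stdint.h>

#define QK8_0 32

typedef struct {
    uint16_t d;         // per-block scale, stored as FP16 in ggml
    int8_t   qs[QK8_0]; // quantized values
} block_q8_0_sketch;

static void quantize_block_q8_0_sketch(const float * x, block_q8_0_sketch * y,
                                       uint16_t (*fp32_to_fp16)(float)) {
    float amax = 0.0f; // absolute max over the block
    for (int j = 0; j < QK8_0; ++j) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / ((1 << 7) - 1); // map amax to 127
    const float id = d ? 1.0f / d : 0.0f;   // guard the all-zero block
    y->d = fp32_to_fp16(d);
    for (int j = 0; j < QK8_0; ++j) {
        y->qs[j] = (int8_t) roundf(x[j] * id);
    }
}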
@@ -104,7 +105,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
y[i].d = GGML_CPU_FP32_TO_FP16(d);

int32x4_t accv = vdupq_n_s32(0);

@@ -120,7 +121,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
accv = vaddq_s32(accv, vi);
}

y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
y[i].s = GGML_CPU_FP32_TO_FP16(d * vaddvq_s32(accv));
}
#else
GGML_UNUSED(nb);
@@ -194,10 +195,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);

@@ -274,10 +275,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
// dot product
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@@ -313,9 +314,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

// dot product
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@@ -354,9 +355,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

// dot product
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
@@ -404,8 +405,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -423,7 +424,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}

int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
}

*s = sumf;
@@ -464,10 +465,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];

float32_t summs_t[4] = {
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s),
GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s)
};
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));

@@ -490,10 +491,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

// mmla into int32x4_t
float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);

@@ -539,7 +540,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];

summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);

const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -562,8 +563,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -582,7 +583,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}

int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}

*s = sumf;
@@ -666,10 +667,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -694,7 +695,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}

int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
}

*s = sumf;
@@ -739,8 +740,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

const uint8x16_t m4b = vdupq_n_u8(0x0F);

summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);

// extract the 5th bit via lookup table ((b) << 4)
memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -784,10 +785,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -812,7 +813,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}

int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
}

*s = sumf;
@@ -864,10 +865,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

float32_t _scale[4] = {
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
};
float32x4_t scale = vld1q_f32(_scale);

@@ -934,10 +935,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
@@ -960,9 +961,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);

sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@@ -1002,8 +1003,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);

// scale creation
const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d);
const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d);

// duplicate deq1 in first half of vector and deq2 in second half of vector
const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
@@ -1043,11 +1044,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));

sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
}

sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -1059,7 +1060,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
sumi += x[ib].qs[j]*y[ib].qs[j];
}

sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
}

*s = sumf;
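The scalar tail above is the reference form of the whole q8_0 x q8_0 kernel: an integer dot product per block, rescaled by the product of the two FP16 block scales. A standalone sketch, reusing the block type from the q8_0 quantization sketch earlier, with the FP16 load again behind a caller-supplied converter:

static float vec_dot_q8_0_sketch(int nblocks,
                                 const block_q8_0_sketch * x,
                                 const block_q8_0_sketch * y,
                                 float (*fp16_to_fp32)(uint16_t)) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        int sumi = 0; // integer dot product of one 32-element block
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        // rescale by the product of the two per-block scales
        sumf += sumi * (fp16_to_fp32(x[ib].d) * fp16_to_fp32(y[ib].d));
    }
    return sumf;
}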
@@ -1217,7 +1218,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
||||
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumi0 = vaddq_s32(sumi0, sumi1);
|
||||
@@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
}
|
||||
}
|
||||
|
||||
sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -1362,7 +1363,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
|
||||
const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumi0 = vaddq_s32(sumi0, sumi1);
|
||||
@@ -1393,7 +1394,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
}
|
||||
}
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
sumf += (float) sumi * d;
|
||||
}
|
||||
@@ -1425,9 +1426,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
svfloat32_t d_broad = svdup_n_f32((float32_t)d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
@@ -1570,9 +1571,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
case 256:
|
||||
case 512:
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
svfloat32_t d_broad = svdup_n_f32((float32_t)d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
@@ -1671,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
float sum = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -1742,8 +1743,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
@@ -1805,7 +1806,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
|
||||
@@ -1981,7 +1982,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].hmask;
|
||||
@@ -2112,7 +2113,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -2258,18 +2259,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
|
||||
vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
|
||||
const float32x4_t dmins = {
|
||||
GGML_FP16_TO_FP32(x0->dmin) * y0->d,
|
||||
GGML_FP16_TO_FP32(x0->dmin) * y1->d,
|
||||
GGML_FP16_TO_FP32(x1->dmin) * y0->d,
|
||||
GGML_FP16_TO_FP32(x1->dmin) * y1->d,
|
||||
GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d,
|
||||
GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d,
|
||||
GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d,
|
||||
GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d,
|
||||
};
|
||||
vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
|
||||
|
||||
const float32x4_t superblock_scale = {
|
||||
GGML_FP16_TO_FP32(x0->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x0->d) * y1->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y1->d,
|
||||
GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
|
||||
GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
|
||||
GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
|
||||
GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
|
||||
};
|
||||
vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
|
||||
}
|
||||
@@ -2289,8 +2290,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
||||
|
||||
@@ -2377,8 +2378,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
||||
|
||||
@@ -2478,9 +2479,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -2520,8 +2521,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
|
||||
|
||||
@@ -2630,9 +2631,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -2827,10 +2828,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
|
||||
|
||||
const float32x4_t superblock_scale = {
|
||||
GGML_FP16_TO_FP32(x0->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x0->d) * y1->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y0->d,
|
||||
GGML_FP16_TO_FP32(x1->d) * y1->d,
|
||||
GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
|
||||
GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
|
||||
GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
|
||||
GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
|
||||
};
|
||||
|
||||
visum = vsubq_s32(visum, vibias);
|
||||
@@ -2858,7 +2859,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
@@ -3011,7 +3012,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d_all = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q6 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
@@ -3128,7 +3129,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -3199,7 +3200,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
float sumf1 = 0, sumf2 = 0;
|
||||
@@ -3234,7 +3235,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
@@ -3284,7 +3285,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
const uint8x8_t scales8 = vld1_u8(x[i].scales);
|
||||
@@ -3329,7 +3330,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -3398,7 +3399,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
@@ -3458,7 +3459,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
@@ -3521,7 +3522,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -3557,7 +3558,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
     const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
     const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3630,7 +3631,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

     float sumf = 0;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@@ -3691,7 +3692,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -3786,7 +3787,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

         }

-        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
+        sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
     }

     *s = sumf;
@@ -3817,7 +3818,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 4;
         }

-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
     }

     *s = sumf;
@@ -3905,7 +3906,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

         }

-        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+        sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
     }

     *s = sumf;
@@ -3952,7 +3953,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qh += 2;
         }

-        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
     }

     *s = sumf;
@@ -4003,13 +4004,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

         sumf +=
-            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
-            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+            GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
     }

 #endif
     for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
             sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -4071,7 +4072,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

         }

-        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
     }

     *s = sumf;
@@ -4079,7 +4080,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
         uint16_t h = x[ibl].scales_h;
         const uint8_t * qs = x[ibl].qs;
         const int8_t * q8 = y[ibl].qs;
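Every hunk in this compare view applies the same mechanical substitution: the CPU backend's row quantizers and quantized dot products switch from the generic GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 macros to CPU-specific GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 variants, pulled in through the newly added #include "simd-mappings.h". As a reference point, below is a minimal, self-contained sketch of the scalar q8_0 dot-product tail these hunks touch; block_q8_0 and the fp16 helper are simplified stand-ins assumed for illustration, not the real ggml definitions.

// Minimal sketch of the scalar q8_0 x q8_0 dot-product pattern edited above.
// Types and the fp16 helper are illustrative stand-ins; the real definitions
// live in the ggml headers.
#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr int QK8_0 = 32;

struct block_q8_0 {
    uint16_t d;          // per-block scale stored as IEEE-754 binary16
    int8_t   qs[QK8_0];  // 32 signed 8-bit quants
};

// Stand-in for GGML_CPU_FP16_TO_FP32; handles normal values only, which is
// enough for the positive per-block scales used in this demo.
static float fp16_to_fp32(uint16_t h) {
    const uint32_t bits = ((uint32_t)(h & 0x8000) << 16)        // sign
                        | ((((h >> 10) & 0x1fu) + 112) << 23)   // re-biased exponent
                        | ((uint32_t)(h & 0x3ff) << 13);        // mantissa
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Scalar tail of a ggml_vec_dot_q8_0_q8_0-style loop, as in the hunks above:
// integer dot product per block, then one float multiply by both scales.
static float vec_dot_q8_0(int nb, const block_q8_0 * x, const block_q8_0 * y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        // the diffs above swap GGML_FP16_TO_FP32 for GGML_CPU_FP16_TO_FP32 here
        sumf += sumi * (fp16_to_fp32(x[ib].d) * fp16_to_fp32(y[ib].d));
    }
    return sumf;
}

int main() {
    block_q8_0 a = { 0x3c00, {} };  // d = 1.0f
    block_q8_0 b = { 0x3800, {} };  // d = 0.5f
    for (int j = 0; j < QK8_0; ++j) { a.qs[j] = 2; b.qs[j] = 3; }
    std::printf("%f\n", vec_dot_q8_0(1, &a, &b)); // 32 * (2*3) * 1.0 * 0.5 = 96
}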
(File diff suppressed because it is too large.)
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"

 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -474,7 +475,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

         // Quantize these floats
         const float d = max_scalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
         const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
         const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );

@@ -548,7 +549,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

         // Quantize these floats
         const float d = max_scalar / 127.f;
-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
         const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
         const __m256 mul = __lasx_xvreplfr2vr_s( id );

@@ -576,7 +577,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         // Compute the sum of the quants and set y[i].s
         const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
         const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
-        y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
+        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));

         // Convert int32 to int16
         ni0 = lsx_packs_w( ni0, ni1 );
@@ -667,7 +668,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     // Main loop
     for (; ib < nb; ++ib) {
         /* Compute combined scale for the block */
-        const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+        const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );

         __m256i qx = bytes_from_nibbles_32(x[ib].qs);

@@ -699,7 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     for (; ib + 1 < nb; ib += 2) {

         // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );

         const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -717,7 +718,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);

         // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );

         const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

@@ -766,7 +767,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }

     *s = sumf;
@@ -797,10 +798,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

     // Main loop
     for (; ib < nb; ++ib) {
-        const float d0 = GGML_FP16_TO_FP32(x[ib].d);
-        const float d1 = GGML_FP16_TO_FP32(y[ib].d);
+        const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);

-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);

         const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
         const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
@@ -834,7 +835,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }

     *s = sumf;
@@ -865,7 +866,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     // Main loop
     for (; ib < nb; ++ib) {
         /* Compute combined scale for the block */
-        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME

         __m256i qx = bytes_from_nibbles_32(x[ib].qs);
         __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@@ -902,7 +903,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }

     *s = sumf;
@@ -934,16 +935,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

     // Main loop
     for (; ib < nb; ++ib) {
-        const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d));
+        const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d));

-        summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);

         __m256i qx = bytes_from_nibbles_32(x[ib].qs);
         __m256i bxhi = bytes_from_bits_32(x[ib].qh);
         bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
         qx = __lasx_xvor_v(qx, bxhi);

-        const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d));
+        const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d));
         const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);

         const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@@ -973,7 +974,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }

     *s = sumf;
@@ -1003,7 +1004,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     // Main loop
     for (; ib < nb; ++ib) {
         // Compute combined scale for the block
-        const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+        const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
         __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
         __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);

@@ -1023,7 +1024,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }

-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }

     *s = sumf;
@@ -1047,8 +1048,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

         const uint8_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1116,8 +1117,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             summs += y[i].bsums[j] * (sc[j] >> 4);
         }

-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

         int isum = 0;
         int is = 0;
@@ -1170,7 +1171,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
         // Set up scales
@@ -1294,7 +1295,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1330,8 +1331,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -1438,9 +1439,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1477,8 +1478,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         const uint8_t * GGML_RESTRICT q5 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

         memcpy(utmp, x[i].scales, 12);
         utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -1593,9 +1594,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1624,7 +1625,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

     for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

         const uint8_t * GGML_RESTRICT q4 = x[i].ql;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1713,7 +1714,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1780,7 +1781,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
         __m256i sumi1 = __lasx_xvldi(0);
@@ -1820,7 +1821,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
@@ -1895,7 +1896,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;

@@ -1980,7 +1981,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const uint8_t * GGML_RESTRICT sc = x[i].scales;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2049,7 +2050,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@@ -2108,7 +2109,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     float sumf = 0;
     for (int i = 0; i < nb; i++) {

-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const int8_t * q8 = y[i].qs;
         const uint8_t * qs = x[i].qs;
         const uint8_t * qh = x[i].qh;
@@ -2168,7 +2169,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
         const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2213,7 +2214,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
         const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2279,7 +2280,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

     __m256 accumf = (__m256)__lasx_xvldi(0);
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@@ -2340,7 +2341,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -2451,7 +2452,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                 + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
         }

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
         accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
         accum1 += d * sumi1;
     }
@@ -2484,7 +2485,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 4;
         }

-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
     }

     *s = sumf;
@@ -2530,9 +2531,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
         const __m256i p_1 = lasx_madd_h(p16_1, mone);
         const __m256i p_2 = lasx_madd_h(p16_2, mone);
-        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+        accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
                 __lasx_xvffint_s_w(p_1), accum1);
-        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+        accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
                 __lasx_xvffint_s_w(p_2), accum2);
     }

@@ -2540,7 +2541,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

 #endif
     for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
             sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -2595,7 +2596,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
             sumi1 = __lasx_xvadd_w(p_1, sumi1);
             sumi2 = __lasx_xvadd_w(p_2, sumi2);
         }
-        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+        accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                 __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
     }

@@ -2604,7 +2605,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
         uint16_t h = x[ibl].scales_h;
         const uint8_t * qs = x[ibl].qs;
         const int8_t * q8 = y[ibl].qs;
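Several of the scalar tails above (the iq4_nl paths) do more than a plain nibble unpack: each 4-bit quant is an index into a small non-linear codebook. Below is a self-contained sketch of that lookup. The table reproduces ggml's iq4_nl codebook to the best of our reading, and the second accumulation line is inferred from the usual low/high-nibble split (only the sumi1 line is visible in the hunks), so treat the whole block as illustrative rather than the library's API.

// Sketch of the iq4_nl scalar tail: each byte of qs packs two 4-bit indices
// into a non-linear codebook; the caller scales the integer result by
// d = fp16(x.d) * fp16(y.d), as in the loops above.
#include <cstdint>
#include <cstdio>

constexpr int QK4_NL = 32;

static const int8_t kvalues_iq4nl[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// one block: 16 bytes of packed nibbles against 32 int8 activations
static int dot_block_iq4_nl(const uint8_t * qs, const int8_t * q8) {
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < QK4_NL/2; ++j) {
        sumi1 += q8[j]            * kvalues_iq4nl[qs[j] & 0xf]; // low nibble
        sumi2 += q8[j + QK4_NL/2] * kvalues_iq4nl[qs[j] >>  4]; // high nibble
    }
    return sumi1 + sumi2;
}

int main() {
    uint8_t qs[QK4_NL/2];
    int8_t  q8[QK4_NL];
    for (int j = 0; j < QK4_NL/2; ++j) qs[j] = 0x88;  // both nibbles index 8 -> value 1
    for (int j = 0; j < QK4_NL;   ++j) q8[j] = 1;
    std::printf("%d\n", dot_block_iq4_nl(qs, q8));    // 32 * 1 * 1 = 32
}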
ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp (new file, 82 lines)
@@ -0,0 +1,82 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#endif
+
+#include <string>
+
+struct powerpc_features {
+    std::string platform = "";
+    int power_version = -1;
+
+    bool has_vsx = false;
+
+    powerpc_features() {
+#if defined(__linux__)
+        unsigned long auxval = getauxval(AT_PLATFORM);
+        if (auxval) {
+            platform = std::string(reinterpret_cast<const char*>(auxval));
+            // TBD: Do systems exist that return this in uppercase?
+            if (platform.substr(0, 5) == "power") {
+                // Extract a numeric suffix, if one exists
+                int vpos = -1;
+                for (int i = platform.length() - 1; i >= 0; i--) {
+                    if (std::isdigit(platform[i])) {
+                        vpos = i;
+                    } else {
+                        break;
+                    }
+                }
+                if (vpos > -1) {
+                    power_version = std::stoi(platform.substr(vpos));
+                }
+            }
+        }
+#endif
+        if (power_version >= 9) {
+            has_vsx = true;
+        }
+    }
+};
+
+static int ggml_backend_cpu_powerpc_score() {
+    int score = 1;
+    powerpc_features pf;
+
+    // Platform scores
+#if defined(GGML_USE_POWER7)
+    if (pf.power_version < 7) { return 0; }
+    score += 1<<1;
+#endif
+#if defined(GGML_USE_POWER8)
+    if (pf.power_version < 8) { return 0; }
+    score += 1<<2;
+#endif
+#if defined(GGML_USE_POWER9)
+    if (pf.power_version < 9) { return 0; }
+    score += 1<<3;
+#endif
+#if defined(GGML_USE_POWER10)
+    if (pf.power_version < 10) { return 0; }
+    score += 1<<4;
+#endif
+#if defined(GGML_USE_POWER11)
+    if (pf.power_version < 11) { return 0; }
+    score += 1<<5;
+#endif
+
+    // Feature scores
+#if defined(GGML_USE_VSX)
+    if (!pf.has_vsx) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_powerpc_score)
+
+#endif // defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)
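The new cpu-feats.cpp above scores the running machine so the dynamic backend loader can pick the best-matching CPU variant: it reads AT_PLATFORM (e.g. "power10"), parses the trailing digits into power_version, and weights each compiled-in ISA level with a distinct bit. A standalone sketch of just the version parse follows; parse_power_version is our illustrative name, not part of ggml.

// Standalone sketch of the trailing-digit parse performed by
// powerpc_features above ("power10" -> 10).
#include <cctype>
#include <iostream>
#include <string>

static int parse_power_version(const std::string & platform) {
    if (platform.rfind("power", 0) != 0) {
        return -1;  // not a "powerN" platform string
    }
    int vpos = -1;
    for (int i = (int) platform.length() - 1; i >= 0; i--) {
        if (std::isdigit((unsigned char) platform[i])) {
            vpos = i;    // keep extending the numeric suffix leftwards
        } else {
            break;       // stop at the first non-digit from the right
        }
    }
    return vpos > -1 ? std::stoi(platform.substr(vpos)) : -1;
}

int main() {
    std::cout << parse_power_version("power10") << '\n'; // 10
    std::cout << parse_power_version("power9")  << '\n'; // 9
    std::cout << parse_power_version("power")   << '\n'; // -1 (no suffix)
}

Because each GGML_USE_POWERn block adds a higher bit (1<<1 through 1<<5) and returns 0 outright when the host is older than the build target, variants built for newer ISAs outscore older ones exactly when they can actually run.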
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"

 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -67,7 +68,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float id = d ? 1.0f/d : 0.0f;
         const vector float vid = vec_splats(id);

-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);

         for (int j = 0; j < 8; j++) {
             const vector float v = vec_round(vec_mul(srcv[j], vid));
@@ -112,7 +113,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         const float id = d ? 1.0f/d : 0.0f;
         const vector float vid = vec_splats(id);

-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);

         vector int accv = vec_splats(0);

@@ -127,7 +128,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

         accv = vec_add(accv, vec_sld(accv, accv, 4));
         accv = vec_add(accv, vec_sld(accv, accv, 8));
-        y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+        y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0));
     }

 #else
@@ -170,8 +171,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         __builtin_prefetch(x[ib].qs, 0, 1);
         __builtin_prefetch(y[ib].qs, 0, 1);

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
         vector float vd = vec_mul(vxd, vyd);

         vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -214,7 +215,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }

     *s = sumf;
@@ -249,12 +250,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         __builtin_prefetch(x[ib].qs, 0, 1);
         __builtin_prefetch(y[ib].qs, 0, 1);

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
         vector float vd = vec_mul(vxd, vyd);

-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
-        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
+        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
+        vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
         vsumf0 = vec_madd(vxmin, vys, vsumf0);

         vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -291,7 +292,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }

     *s = sumf;
@@ -326,8 +327,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         __builtin_prefetch(x[ib].qs, 0, 1);
         __builtin_prefetch(y[ib].qs, 0, 1);

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
         vector float vd = vec_mul(vxd, vyd);

         vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
@@ -379,7 +380,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }

     *s = sumf;
@@ -415,12 +416,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         __builtin_prefetch(x[ib].qs, 0, 1);
         __builtin_prefetch(y[ib].qs, 0, 1);

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
         vector float vd = vec_mul(vxd, vyd);

-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
-        vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
+        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
+        vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
         vsumf0 = vec_madd(vxmin, vys, vsumf0);

         vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
@@ -470,7 +471,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }

     *s = sumf;
@@ -502,8 +503,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
         __builtin_prefetch(x[ib].qs, 0, 1);
         __builtin_prefetch(y[ib].qs, 0, 1);

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
         vector float vd = vec_mul(vxd, vyd);

         vector signed char q8x0 = vec_xl( 0, x[ib].qs);
@@ -542,7 +543,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }

-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }

     *s = sumf;
@@ -574,11 +575,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
         vector float vdmin = vec_mul(vxmin, vyd);

         vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@@ -708,8 +709,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             summs += y[i].bsums[j] * (sc[j] >> 4);
         }

-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

         int isum = 0;
         int is = 0;
@@ -770,7 +771,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -962,7 +963,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1005,11 +1006,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
         vector float vdmin = vec_mul(vxmin, vyd);

         vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@@ -1177,9 +1178,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1222,11 +1223,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

-        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
         vector float vdmin = vec_mul(vxmin, vyd);

         UNUSED(kmask1);
@@ -1394,9 +1395,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1432,7 +1433,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -1591,7 +1592,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1659,7 +1660,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -1742,7 +1743,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
@@ -1790,7 +1791,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
     const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -1871,7 +1872,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const uint8_t * GGML_RESTRICT sc = x[i].scales;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1939,7 +1940,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -2033,7 +2034,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     float sumf = 0;
     for (int i = 0; i < nb; i++) {

-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const int8_t * q8 = y[i].qs;
         const uint8_t * qs = x[i].qs;
         const uint8_t * qh = x[i].qh;
@@ -2096,7 +2097,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -2176,7 +2177,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
         const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
         const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2236,7 +2237,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -2329,7 +2330,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -2394,7 +2395,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     vector float vsumf3 = vec_splats(0.0f);

     for (int i = 0; i < nb; ++i) {
-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -2505,7 +2506,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 4;
         }

-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
     }

     *s = sumf;
@@ -2546,8 +2547,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         __builtin_prefetch(y[ib].qs, 0, 1);


-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
-        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
         vector float vd = vec_mul(vxd, vyd);

         vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -2582,7 +2583,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

 #endif
     for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
             sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -2620,7 +2621,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

     for (int ibl = 0; ibl < nb; ++ibl) {

-        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d));
         vector float vyd = vec_splats(y[ibl].d);
         vector float vd = vec_mul(vxd, vyd);

@@ -2697,7 +2698,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
         uint16_t h = x[ibl].scales_h;
         const uint8_t * qs = x[ibl].qs;
         const int8_t * q8 = y[ibl].qs;
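The PowerPC hunks above all share one shape: broadcast the combined per-block scale into a vector register with vec_splats, then fold each block's integer dot product into a float accumulator with vec_madd. A scalar rendition of that structure follows, compilable anywhere (no VSX intrinsics); the names are ours, and the real code keeps four vector accumulators vsumf0..vsumf3 rather than one float.

// Scalar rendition of the VSX accumulate pattern: combined scale per block,
// fused multiply-add into a running sum. Illustrative only.
#include <cstdio>

struct block { float xd, yd; int sumi; };  // stand-in for decoded per-block data

static float accumulate(const block * b, int nb) {
    float vsumf0 = 0.0f;                       // plays the role of vec_splats(0.0f)
    for (int i = 0; i < nb; ++i) {
        const float vd = b[i].xd * b[i].yd;    // vec_mul(vxd, vyd)
        vsumf0 += vd * (float) b[i].sumi;      // vec_madd(vd, sumi, vsumf0)
    }
    return vsumf0;
}

int main() {
    block bs[2] = { {1.0f, 0.5f, 10}, {2.0f, 0.25f, 4} };
    std::printf("%f\n", accumulate(bs, 2));    // 10*0.5 + 4*0.5 = 7
}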
@@ -3,6 +3,7 @@
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "simd-mappings.h"
|
||||
|
||||
#include "../../quants.h"
|
||||
#include "../../ggml-cpu-impl.h"
|
||||
@@ -45,7 +46,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
||||
|
||||
vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
|
||||
|
||||
@@ -85,7 +86,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
||||
|
||||
vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
|
||||
|
||||
@@ -102,7 +103,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
// set y[i].s
|
||||
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
||||
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
||||
y[i].s = GGML_CPU_FP32_TO_FP16(sum*d);
|
||||
}
|
||||
|
||||
#else
|
||||
@@ -160,7 +161,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
||||
|
||||
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
|
||||
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -177,7 +178,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
|
||||
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -225,7 +226,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -242,7 +243,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -293,7 +294,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
|
||||
int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -316,7 +317,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -366,7 +367,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
|
||||
int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -389,7 +390,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -427,7 +428,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
|
||||
|
||||
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
|
||||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -438,7 +439,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
sumi += x[ib].qs[j]*y[ib].qs[j];
|
||||
}
|
||||
|
||||
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
|
||||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -465,8 +466,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
uint8_t *patmp = atmp;
|
||||
int vsums;
|
||||
int tmp;
|
||||
@@ -569,8 +570,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
size_t vl = 16;
|
||||
|
||||
@@ -644,8 +645,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const uint8_t * q2 = x[i].qs;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * sc = x[i].scales;
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
uint8_t *patmp = atmp;
|
||||
int vsums;
|
||||
int tmp;
|
||||
@@ -750,8 +751,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
@@ -916,7 +917,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
q3 += 32; q8 += 128; scale += 8;
|
||||
}
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
sumf += d * isum;
|
||||
}
|
||||
|
||||
@@ -1017,7 +1018,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
}
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
|
||||
sumf += d*sum_t;
|
||||
|
||||
@@ -1134,7 +1135,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
q3 += 32; q8 += 128; scale += 8;
|
||||
}
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
sumf += d * isum;
|
||||
}
|
||||
break;
|
||||
@@ -1202,7 +1203,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -1239,8 +1240,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int tmp, tmp2, sumi;
|
||||
__asm__ __volatile__(
|
||||
@@ -1361,8 +1362,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
size_t vl = 8;
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
 vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
@@ -1422,8 +1423,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 break;
 case 128:
 for (int i = 0; i < nb; ++i) {
-const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 int tmp, tmp2, sumi;
 __asm__ __volatile__(
@@ -1580,9 +1581,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1627,8 +1628,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 const uint8_t * GGML_RESTRICT hm = x[i].qh;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;

-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;

 vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
 vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
@@ -1749,9 +1750,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1778,7 +1779,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 for (int i = 0; i < nb; ++i) {

-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;

 const uint8_t * restrict q6 = x[i].ql;
 const uint8_t * restrict qh = x[i].qh;
@@ -1862,7 +1863,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 case 256:
 for (int i = 0; i < nb; ++i) {

-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;

 const uint8_t * GGML_RESTRICT q6 = x[i].ql;
 const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1943,7 +1944,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 case 128:
 for (int i = 0; i < nb; ++i) {

-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;

 const uint8_t * restrict q6 = x[i].ql;
 const uint8_t * restrict qh = x[i].qh;
@@ -2058,7 +2059,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
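Every hunk above follows the same structure: the dot product is accumulated as integers within a block, then scaled once per block by the product of the two fp16 scales, which is exactly where the renamed conversion macro sits on the hot path. Below is a minimal scalar sketch of that pattern, assuming QK8_0 = 32; the toy_* names are illustrative (not ggml API), and the scale is kept as a plain float instead of the ggml_fp16_t the real blocks store.

```c
#include <stddef.h>
#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;          // per-block scale (stored as fp16 in the real block_q8_0)
    int8_t qs[QK8_0];  // quantized values
} toy_block_q8_0;

// Dot product of two rows of n values, n a multiple of QK8_0.
static float toy_vec_dot_q8_0(size_t n, const toy_block_q8_0 *x, const toy_block_q8_0 *y) {
    const size_t nb = n / QK8_0;
    float sumf = 0.0f;
    for (size_t ib = 0; ib < nb; ++ib) {
        int sumi = 0;                        // integer accumulation inside the block
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];
        }
        sumf += sumi * (x[ib].d * y[ib].d);  // both scales applied once per block
    }
    return sumf;
}
```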
@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"

 #include <cmath>
@@ -90,16 +91,16 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);

 // vector version needs Zvfhmin extension
-const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
+const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 const float b_scales[8] = {
-GGML_FP16_TO_FP32(b_ptr[l].d[0]),
-GGML_FP16_TO_FP32(b_ptr[l].d[1]),
-GGML_FP16_TO_FP32(b_ptr[l].d[2]),
-GGML_FP16_TO_FP32(b_ptr[l].d[3]),
-GGML_FP16_TO_FP32(b_ptr[l].d[4]),
-GGML_FP16_TO_FP32(b_ptr[l].d[5]),
-GGML_FP16_TO_FP32(b_ptr[l].d[6]),
-GGML_FP16_TO_FP32(b_ptr[l].d[7])
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
 };
 const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
 const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
@@ -129,7 +130,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
 sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
 }
-sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
 }
 }
 }
@@ -181,20 +182,20 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo

 // vector version needs Zvfhmin extension
 const float a_scales[4] = {
-GGML_FP16_TO_FP32(a_ptr[l].d[0]),
-GGML_FP16_TO_FP32(a_ptr[l].d[1]),
-GGML_FP16_TO_FP32(a_ptr[l].d[2]),
-GGML_FP16_TO_FP32(a_ptr[l].d[3])
+GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]),
+GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]),
+GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]),
+GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3])
 };
 const float b_scales[8] = {
-GGML_FP16_TO_FP32(b_ptr[l].d[0]),
-GGML_FP16_TO_FP32(b_ptr[l].d[1]),
-GGML_FP16_TO_FP32(b_ptr[l].d[2]),
-GGML_FP16_TO_FP32(b_ptr[l].d[3]),
-GGML_FP16_TO_FP32(b_ptr[l].d[4]),
-GGML_FP16_TO_FP32(b_ptr[l].d[5]),
-GGML_FP16_TO_FP32(b_ptr[l].d[6]),
-GGML_FP16_TO_FP32(b_ptr[l].d[7])
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
+GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
 };
 const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);

@@ -382,7 +383,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
 (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
 }
-sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
 }
 }
 }
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"

 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -49,7 +50,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 const float d = amax / ((1 << 7) - 1);
 const float id = d ? 1.0f / d : 0.0f;

-y[i].d = GGML_FP32_TO_FP16(d);
+y[i].d = GGML_CPU_FP32_TO_FP16(d);

 for (int j = 0; j < 8; j++) {
 const __vector float v = vec_mul(srcv[j], vec_splats(id));
@@ -94,7 +95,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 const float d = amax / ((1 << 7) - 1);
 const float id = d ? 1.0f / d : 0.0f;

-y[i].d = GGML_FP32_TO_FP16(d);
+y[i].d = GGML_CPU_FP32_TO_FP16(d);

 __vector int32_t acc = vec_splats(0);

@@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 acc = vec_add(acc, vi);
 }

-y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
+y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
 }
 #else
 GGML_UNUSED(nb);
@@ -164,7 +165,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);

 const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
-const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

 acc = vec_madd(v_xy, v_d, acc);
 }
@@ -185,7 +186,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
 }

 *s = sumf;
@@ -219,7 +220,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 __builtin_prefetch(x[ib].qs, 0, 1);
 __builtin_prefetch(y[ib].qs, 0, 1);

-summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);

 const uint8x16_t v_x = vec_xl(0, x[ib].qs);
 const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
@@ -231,7 +232,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
 const float32x4_t v_xy = vec_float(v_xy_);

-const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

 acc = vec_madd(v_xy, v_d, acc);
 }
@@ -252,7 +253,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
 }

 *s = sumf;
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

 const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
 const float32x4_t v_xy = vec_float(v_xy_);
-const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

 acc = vec_madd(v_xy, v_d, acc);
 }
@@ -305,7 +306,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 sumi += x[ib].qs[j]*y[ib].qs[j];
 }

-sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
 }

 *s = sumf;
@@ -348,7 +349,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 float sum = 0;

 for (int i = 0; i < nb; ++i) {
-const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

 const uint8_t * restrict x0l = x[i].qs;
 const uint8_t * restrict x0h = x[i].hmask;
@@ -497,7 +498,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -537,8 +538,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 float sumf = 0;

 for (int i = 0; i < nb; ++i) {
-const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
 const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@@ -647,9 +648,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -698,8 +699,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 float sumf = 0;

 for (int i = 0; i < nb; ++i) {
-const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
 const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@@ -819,9 +820,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -859,7 +860,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 int8x16_t v_y[4];

 for (int i = 0; i < nb; ++i) {
-const float d_all = GGML_FP16_TO_FP32(x[i].d);
+const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);

 const uint8_t * GGML_RESTRICT x0l = x[i].ql;
 const uint8_t * GGML_RESTRICT x0h = x[i].qh;
@@ -1004,7 +1005,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1071,7 +1072,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 // float sumf = 0;

 // for (int i = 0; i < nb; ++i) {
-// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
 // const int8_t * GGML_RESTRICT q8 = y[i].qs;

@@ -1121,7 +1122,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 // float sumf = 0.f;
 // for (int i = 0; i < nb; ++i) {
-// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
 // const int8_t * GGML_RESTRICT q8 = y[i].qs;
 // int32_t bsum = 0;
@@ -1182,12 +1183,12 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
 const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
 const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);

-sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
 }

 #endif
 for (; ib < nb; ++ib) {
-const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
 int sumi1 = 0, sumi2 = 0;
 for (int j = 0; j < QK4_NL/2; ++j) {
 sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1257,7 +1258,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
 }

-sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
 }

 *s = sumf;
@@ -1265,7 +1266,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
 float sumf = 0;
 for (int ibl = 0; ibl < nb; ++ibl) {
-const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
 uint16_t h = x[ibl].scales_h;
 const uint8_t * qs = x[ibl].qs;
 const int8_t * q8 = y[ibl].qs;
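For reference, the scalar logic behind the quantize_row_q8_0 hunks above: find the block's absolute maximum, derive the scale d = amax/127 (the value that the renamed FP32→FP16 store macro writes), and map each float to an int8 with the inverse scale. A self-contained sketch under those assumptions; the toy_* names are illustrative rather than ggml API, and the scale stays a float for simplicity.

```c
#include <math.h>
#include <stdint.h>

#define QK8_0 32

typedef struct {
    float  d;          // stored via GGML_CPU_FP32_TO_FP16 as ggml_fp16_t in the real code
    int8_t qs[QK8_0];
} toy_block_q8_0;

// Quantize one block of QK8_0 floats to q8_0.
static void toy_quantize_block_q8_0(const float *x, toy_block_q8_0 *y) {
    float amax = 0.0f;                       // absolute max over the block
    for (int j = 0; j < QK8_0; ++j) {
        const float v = fabsf(x[j]);
        if (v > amax) amax = v;
    }

    const float d  = amax / ((1 << 7) - 1);  // scale: amax maps to 127
    const float id = d ? 1.0f / d : 0.0f;    // inverse scale (0 for an all-zero block)

    y->d = d;
    for (int j = 0; j < QK8_0; ++j) {
        y->qs[j] = (int8_t) roundf(x[j] * id);
    }
}
```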
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"

 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
@@ -65,7 +66,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 const float d = amax / ((1 << 7) - 1);
 const float id = d ? 1.0f/d : 0.0f;

-y[i].d = GGML_FP32_TO_FP16(d);
+y[i].d = GGML_CPU_FP32_TO_FP16(d);

 for (int j = 0; j < 8; j++) {
 const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id));
@@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 const float d = amax / ((1 << 7) - 1);
 const float id = d ? 1.0f/d : 0.0f;

-y[i].d = GGML_FP32_TO_FP16(d);
+y[i].d = GGML_CPU_FP32_TO_FP16(d);

 v128_t accv = wasm_i32x4_splat(0);

@@ -126,7 +127,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 accv = wasm_i32x4_add(accv, vi);
 }

-y[i].s = GGML_FP32_TO_FP16(
+y[i].s = GGML_CPU_FP32_TO_FP16(
 d * (wasm_i32x4_extract_lane(accv, 0) +
 wasm_i32x4_extract_lane(accv, 1) +
 wasm_i32x4_extract_lane(accv, 2) +
@@ -324,8 +325,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 );

 // Accumulate results with scaling
-float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
-float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d);
+float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
+float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d);

 sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0)));
 sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1)));
@@ -348,7 +349,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
 }

 *s = sumf;
@@ -428,7 +429,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
 wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
 }

 sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -454,7 +455,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
 }

 *s = sumf;
@@ -491,7 +492,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
 const block_q8_1 * GGML_RESTRICT y0 = &y[ib];

-summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
+summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);

 const v128_t m4b = wasm_i8x16_splat(0x0F);

@@ -538,7 +539,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
 wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
 wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
+wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d))));
 }

 sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -564,7 +565,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 int sumi = sumi0 + sumi1;
-sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
 }

 *s = sumf;
@@ -620,7 +621,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), wasm_i32x4_add(dx1_0, dx1_1));

 // Convert to float and accumulate
-const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d);
+const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d);
 sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale)));
 }

@@ -635,7 +636,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
 sumi += x[ib].qs[j]*y[ib].qs[j];
 }

-sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
 }

 *s = sumf;
@@ -746,8 +747,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 isum += wasm_i32x4_extract_lane(isum_vec, 0);
 }

-const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf += dall * isum - dmin * summs;
 }

@@ -768,8 +769,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 summs += y[i].bsums[j] * (sc[j] >> 4);
 }

-const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

 int isum = 0;
 int is = 0;
@@ -880,7 +881,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 }

 // Accumulate results
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 const v128_t v_d = wasm_f32x4_splat(d);
 v128_t v_sum = wasm_f32x4_add(
 wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d),
@@ -957,7 +958,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -991,8 +992,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 float sumf = 0;

 for (int i = 0; i < nb; ++i) {
-const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign

 const uint8_t * GGML_RESTRICT q4 = x[i].qs;
 const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1136,9 +1137,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1170,8 +1171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 float sumf = 0;

 for (int i = 0; i < nb; ++i) {
-const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign
+const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign

 const uint8_t * GGML_RESTRICT q5 = x[i].qs;
 const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1331,9 +1332,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
 sumf -= dmin * sumi;
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1420,7 +1421,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 wasm_v128_store(&aux32[0], acc0);
 wasm_v128_store(&aux32[4], acc1);

-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) {
 sums[l] += d * aux32[l];
 }
@@ -1470,7 +1471,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
 q8 += 8; a += 8;
 }
-const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
 for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
 }
 for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -3,6 +3,7 @@
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "simd-mappings.h"
|
||||
|
||||
#include "../../quants.h"
|
||||
#include "../../ggml-cpu-impl.h"
|
||||
@@ -256,9 +257,9 @@ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_
|
||||
|
||||
// quad fp16 delta calculation
|
||||
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
|
||||
// GGML_FP16_TO_FP32 is faster than Intel F16C
|
||||
return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
|
||||
_mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
|
||||
// GGML_CPU_FP16_TO_FP32 is faster than Intel F16C
|
||||
return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
|
||||
_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
@@ -305,7 +306,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
// Quantize these floats
|
||||
const float d = maxScalar / 127.f;
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
||||
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
@@ -401,7 +402,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
// Quantize these floats
|
||||
const float d = max_scalar / 127.f;
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
y[i].d = GGML_CPU_FP32_TO_FP16(d);
|
||||
const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
@@ -425,7 +426,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
|
||||
#if defined(__AVX2__)
|
||||
// Compute the sum of the quants and set y[i].s
|
||||
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
|
||||
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
|
||||
|
||||
// Convert int32 to int16
|
||||
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
||||
@@ -455,7 +456,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
|
||||
// Compute the sum of the quants and set y[i].s
|
||||
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
||||
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
||||
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
|
||||
y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
|
||||
|
||||
// Convert int32 to int16
|
||||
ni0 = _mm_packs_epi32( ni0, ni1 );
|
||||
@@ -552,7 +553,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
|
||||
const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
|
||||
|
||||
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
||||
|
||||
@@ -613,7 +614,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
_mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
|
||||
|
||||
// Compute combined scale for the block 0 and 1
|
||||
const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
|
||||
const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
|
||||
|
||||
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
||||
|
||||
@@ -631,7 +632,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
|
||||
|
||||
// Compute combined scale for the block 2 and 3
|
||||
const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
|
||||
const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
|
||||
|
||||
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
|
||||
@@ -680,7 +681,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
|
||||
sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -711,10 +712,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
const float d0 = GGML_FP16_TO_FP32(x[ib].d);
|
||||
const float d1 = GGML_FP16_TO_FP32(y[ib].d);
|
||||
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
|
||||
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
|
||||
const __m256 d0v = _mm256_set1_ps( d0 );
|
||||
const __m256 d1v = _mm256_set1_ps( d1 );
|
||||
@@ -752,7 +753,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -783,7 +784,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
|
||||
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
|
||||
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
||||
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
||||
@@ -807,7 +808,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
|
||||
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
|
||||
__m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
|
||||
const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
||||
@@ -851,7 +852,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -883,16 +884,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
|
||||
const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
|
||||
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
|
||||
__m256i qx = bytes_from_nibbles_32(x[ib].qs);
|
||||
__m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
||||
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
||||
qx = _mm256_or_si256(qx, bxhi);
|
||||
|
||||
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
|
||||
const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
|
||||
|
||||
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
||||
@@ -910,9 +911,9 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
|
||||
const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
|
||||
summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
|
||||
__m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
|
||||
const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
|
||||
@@ -926,7 +927,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
bxh = _mm_or_si128(bxh, bxhih);
|
||||
bx_0 = MM256_SET_M128I(bxh, bxl);
|
||||
|
||||
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
|
||||
const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
|
||||
|
||||
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
||||
@@ -956,7 +957,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
}
|
||||
|
||||
int sumi = sumi0 + sumi1;
|
||||
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
|
||||
sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -986,7 +987,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
|
||||
const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
__m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
|
||||
__m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
|
||||
|
||||
@@ -1025,7 +1026,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
sumi += x[ib].qs[j]*y[ib].qs[j];
|
||||
}
|
||||
|
||||
sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
|
||||
sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -1144,7 +1145,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
}
|
||||
|
||||
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
|
||||
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
|
||||
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
|
||||
|
||||
sumi0 = _mm256_sub_epi16(sumi0, ysum);
|
||||
sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
|
||||
@@ -1190,7 +1191,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
}
|
||||
}
|
||||
|
||||
sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||
sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
@@ -1244,7 +1245,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
}
|
||||
|
||||
const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
|
||||
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
|
||||
const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
|
||||
|
||||
sumi0 = _mm256_add_epi16(sumi0, sumi1);
|
||||
sumi0 = _mm256_sub_epi16(sumi0, ysum);
|
||||
@@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
}
|
||||
}
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
sumf += (float) sumi * d;
|
||||
}
|
||||
@@ -1299,8 +1300,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -1366,8 +1367,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -1477,8 +1478,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
summs += y[i].bsums[j] * (sc[j] >> 4);
|
||||
}
|
||||
|
||||
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
int isum = 0;
|
||||
int is = 0;
|
||||
@@ -1533,7 +1534,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -1638,7 +1639,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -1824,7 +1825,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -1862,8 +1863,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
@@ -1928,8 +1929,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -2049,9 +2050,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -2092,8 +2093,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
|
||||
@@ -2170,8 +2171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q5 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -2311,9 +2312,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
|
||||
sumf -= dmin * sumi;
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -2344,7 +2345,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
@@ -2422,7 +2423,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
||||
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
|
||||
|
||||
const uint8_t * GGML_RESTRICT q4 = x[i].ql;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
@@ -2555,7 +2556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
|
||||
q8 += 8; a += 8;
|
||||
}
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
|
||||
}
|
||||
for (int l = 0; l < 8; ++l) sumf += sums[l];
|
||||
@@ -2622,7 +2623,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
@@ -2663,7 +2664,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
__m128i sumi1_0 = _mm_setzero_si128();
|
||||
@@ -2717,7 +2718,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
@@ -2792,7 +2793,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
||||
@@ -2913,7 +2914,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
||||
@@ -3035,7 +3036,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint16_t * GGML_RESTRICT q2 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT sc = x[i].scales;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -3104,7 +3105,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
||||
@@ -3177,7 +3178,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
||||
@@ -3253,7 +3254,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const int8_t * q8 = y[i].qs;
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * qh = x[i].qh;
|
||||
@@ -3313,7 +3314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -3358,7 +3359,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -3414,7 +3415,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT q3 = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
@@ -3480,7 +3481,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
|
||||
@@ -3565,7 +3566,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
|
||||
@@ -3648,7 +3649,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * GGML_RESTRICT qs = x[i].qs;
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const uint8_t * GGML_RESTRICT signs = x[i].signs;
|
||||
@@ -3753,7 +3754,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
         }

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
         accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
         accum1 += d * sumi1;

@@ -3801,7 +3802,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                        + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
         }

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
         accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
         accum1 += d * sumi1;

@@ -3835,7 +3836,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 4;
         }

-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
     }

     *s = sumf;
@@ -3947,7 +3948,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 8; qh += 4;
         }

-        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));

         accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
         accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
@@ -4033,7 +4034,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 8; qh += 4;
         }

-        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));

         accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
         accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
@@ -4083,7 +4084,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qh += 2;
         }

-        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
     }

     *s = sumf;
@@ -4129,9 +4130,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
         const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
         const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
-        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
                 _mm256_cvtepi32_ps(p_1), accum1);
-        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
                 _mm256_cvtepi32_ps(p_2), accum2);
     }

@@ -4164,7 +4165,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

 #endif
     for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
             sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -4219,7 +4220,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
             sumi1 = _mm256_add_epi32(p_1, sumi1);
             sumi2 = _mm256_add_epi32(p_2, sumi2);
         }
-        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+        accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                 _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
     }

@@ -4267,7 +4268,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
         }
         __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
         __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
-        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                 _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
     }

@@ -4276,7 +4277,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
         uint16_t h = x[ibl].scales_h;
         const uint8_t * qs = x[ibl].qs;
         const int8_t  * q8 = y[ibl].qs;
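The recurring change in the hunks above (and in most of what follows) is mechanical: every `GGML_FP16_TO_FP32`/`GGML_FP32_TO_FP16` use in CPU-backend code becomes the `GGML_CPU_`-prefixed variant from `simd-mappings.h`. For readers unfamiliar with what such a macro must do on targets without native FP16 support, here is a minimal standalone sketch of the scalar binary16-to-binary32 conversion. The function name is hypothetical and this is not ggml's implementation, which prefers hardware paths (F16C, NEON, NNPA) or a precomputed table where available.

#include <cstdint>
#include <cstring>

// Minimal scalar IEEE binary16 -> binary32 conversion (illustrative only).
static float fp16_to_fp32_scalar(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    int32_t  exp  = (h >> 10) & 0x1f;
    uint32_t mant = h & 0x3ff;
    uint32_t bits;

    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                      // +/- zero
        } else {
            // subnormal: renormalize the mantissa
            while (!(mant & 0x400)) {
                mant <<= 1;
                exp--;
            }
            mant &= 0x3ff;
            bits = sign | (uint32_t)(exp + 1 + 127 - 15) << 23 | (mant << 13);
        }
    } else if (exp == 0x1f) {
        bits = sign | 0x7f800000 | (mant << 13); // inf / NaN
    } else {
        bits = sign | (uint32_t)(exp + 127 - 15) << 23 | (mant << 13);
    }

    float f;
    std::memcpy(&f, &bits, sizeof(f));        // type-pun via memcpy, no UB
    return f;
}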
@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"

 #include <cmath>
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
     float tmp[16];

     for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }

     for (int i = 0; i < 8; i++) {
-        tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
+        tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
     }

     return _mm512_loadu_ps(tmp);
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
     _mm_storeu_si128((__m128i*)tmphalf, x);

     for (int i = 0; i < 4; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
-        tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
-        tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
-        tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+        tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
     }

     return _mm512_loadu_ps(tmp);
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];

     for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
     float tmp[8];

     for (int i = 0; i < 4; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-        tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
+        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
+        tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang

     _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
     for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+        tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
     }

     return _mm256_loadu_ps(tmp);
@@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
             id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;

             // Store the scale for the individual block
-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);

             // Store the values in blocks of eight values - Aim is to use these later for block interleaving
             srcv[row_iter][0] = v0;
@@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
             const float d = amax / ((1 << 7) - 1);
             id[row_iter] = d ? 1.0f / d : 0.0f;

-            y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+            y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
         }

         for (int j = 0; j < QK8_0 * 4; j++) {
@@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
             const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);

             // Load and convert to FP32 scale from block_q8_0
-            const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
+            const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));

             // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
             __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     sumi2 = sumi2 * scales_1[j];
                     sumi += sumi1 + sumi2;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
             }
         }
         for (int sb = 0; sb < 8; sb++) {
             uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
             for (int j = 0; j < ncols_interleaved; j++) {
-                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+                sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
             }
         }
     }
@@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                         sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
                                  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                     }
-                    sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                    sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
                 }
             }
         }
@@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                         sumi2 = sumi2 * scales_1[j];
                         sumi += sumi1 + sumi2;
                     }
-                    sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+                    sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
                 }
             }
         }
@@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             for(int m = 0; m < 4; m++) {
                 const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
                 for(int j = 0; j < ncols_interleaved; j++) {
-                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+                    sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
                 }
             }
         }
@@ -4,6 +4,7 @@
 #include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
+#include "simd-mappings.h"

 #ifdef __cplusplus

@@ -12,11 +13,11 @@
 // convenience functions/macros for use in template calls
 // note: these won't be required after the 'traits' lookup table is used.
 static inline ggml_fp16_t f32_to_f16(float x) {
-    return GGML_FP32_TO_FP16(x);
+    return GGML_CPU_FP32_TO_FP16(x);
 }

 static inline float f16_to_f32(ggml_fp16_t x) {
-    return GGML_FP16_TO_FP32(x);
+    return GGML_CPU_FP16_TO_FP32(x);
 }

 static inline ggml_bf16_t f32_to_bf16(float x) {
@@ -62,11 +62,17 @@ struct ggml_compute_params {
 #if defined(__s390x__) && defined(__VEC__)
 #ifndef __VXE__
 #define __VXE__
-#endif
+#endif // __VXE__
 #ifndef __VXE2__
 #define __VXE2__
-#endif
-#endif
+#endif // __VXE2__
+#endif // __s390x__ && __VEC__
+
+#if defined(__s390x__) && defined(GGML_NNPA)
+#ifndef __NNPA__
+#define __NNPA__
+#endif // __NNPA__
+#endif // __s390x__ && GGML_NNPA

 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
@@ -371,7 +377,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
 #endif

-typedef signed char char8x16_t __attribute((vector_size(16)));
+typedef signed char char8x16_t __attribute__((vector_size(16)));
 typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));

 typedef int8_t int8x16_t __attribute__((vector_size(16)));
@@ -382,10 +388,10 @@ typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
 typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));

-typedef float float32x4_t __attribute__((vector_size(16)));
-typedef double double64x2_t __attribute((vector_size(16)));
+typedef float float32x4_t __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));

-typedef signed long long long64x2_t __attribute((vector_size(16)));
+typedef signed long long long64x2_t __attribute__((vector_size(16)));
 typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));

 typedef struct ggml_uint8x16x2_t {
@@ -503,31 +509,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);

+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif
-
-#define GGML_DO_PRAGMA_(x) _Pragma (#x)
-#define GGML_DO_PRAGMA(x) GGML_DO_PRAGMA_(x)
-#if defined(GGML_CPU_GENERIC) || defined(__HIPCC__)
-// Note for Apple targets:
-// - clang: aliases are not supported on darwin
-// - all native kernels need to be implemented in both x86 and arm files
-// - on iOS, tvOS, and visionOS, if cmake cannot determine the target architecture, all `_generic` names are replaced by defines
-# define GGML_WEAK_ALIAS(name, alias)
-#elif defined(__GNUC__)
-// GCC/Clang on *nix
-# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(weak name = alias) // NOLINT
-#elif defined(_MSC_VER) && defined(_WIN64)
-// MSVC
-// Note: C name mangling varies across different calling conventions
-// see https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170
-# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:" #name "=" #alias))
-#elif defined(_MSC_VER) && defined(WIN32)
-// ref: https://github.com/ggml-org/whisper.cpp/pull/3239#issuecomment-2958224591
-# define GGML_WEAK_ALIAS(name, alias) GGML_DO_PRAGMA(comment(linker, "/alternatename:_" #name "=_" #alias))
-#else
-# error "Unsupported compiler for GGML_WEAK_ALIAS"
-#endif
-
-#define GGML_CPU_NATIVE_IMPL(name) GGML_WEAK_ALIAS(name, name ## _generic)
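The block removed above implemented weak aliasing: `GGML_CPU_NATIVE_IMPL(name)` made the public kernel symbol resolve to its `name_generic` fallback unless a native (SIMD) object file provided a strong definition at link time. For reference, a minimal standalone sketch of the GCC/Clang branch of that pattern; all names here are hypothetical, and `extern "C"` keeps the symbols unmangled so the pragma can refer to them directly.

// Sketch of the weak-alias pattern the removed macros generalized.
// `my_kernel` resolves to `my_kernel_generic` unless some other object
// file in the link provides a strong definition of `my_kernel`.
extern "C" void my_kernel_generic(float * dst, const float * src, int n);
#pragma weak my_kernel = my_kernel_generic
extern "C" void my_kernel(float * dst, const float * src, int n);

extern "C" void my_kernel_generic(float * dst, const float * src, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = 2.0f * src[i]; // placeholder work
    }
}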
@@ -72,15 +72,13 @@
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)

 // precomputed f32 table for f16 (256 KB) (simd-mappings.h)
 float ggml_table_f32_f16[1 << 16];

 #if defined(__ARM_ARCH)
 struct ggml_arm_arch_features_type {
     int has_neon;
     int has_dotprod;
     int has_i8mm;
     int has_sve;
     int sve_cnt;
     int has_sme;
-} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
+} ggml_arm_arch_features = { 0 };
 #endif

@@ -197,6 +195,7 @@ typedef pthread_t ggml_thread_t;

 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = {
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32,
         .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32,
         .vec_dot_type = GGML_TYPE_F32,
         .nrows = 1,
@@ -559,6 +558,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }

+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
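The two new helpers wrap the threadpool's atomic `current_chunk` counter so code in other translation units (notably the C++ tinyBLAS code further down) can share one work counter instead of keeping a private static atomic. A sketch of the intended claim-a-chunk pattern, using only the two signatures declared in this diff; the worker scaffolding and `process_chunk` are illustrative and will not link without ggml.

struct ggml_threadpool; // opaque, as in ggml
extern "C" void ggml_barrier(struct ggml_threadpool * tp);
extern "C" void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
extern "C" int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);

void process_chunk(int job); // hypothetical per-chunk work

// Thread `ith` of `nth` claims chunks through the shared counter.
void worker(struct ggml_threadpool * tp, int ith, int nth, int n_chunks) {
    if (ith == 0) {
        // every thread starts on chunk == its own index, so the first
        // unclaimed chunk is nth
        ggml_threadpool_chunk_set(tp, nth);
    }
    ggml_barrier(tp);

    int job = ith;
    while (job < n_chunks) {
        process_chunk(job);
        job = ggml_threadpool_chunk_add(tp, 1); // returns the previous value
    }
}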
@@ -670,87 +677,15 @@ bool ggml_is_numa(void) {

 #if defined(__linux__) && defined(__aarch64__)
 #include <sys/auxv.h>
-#elif defined(__APPLE__)
-#include <sys/sysctl.h>
 #endif

-#if !defined(HWCAP2_I8MM)
-#define HWCAP2_I8MM (1 << 13)
-#endif
-
-#if !defined(HWCAP2_SME)
-#define HWCAP2_SME (1 << 23)
-#endif
-
 static void ggml_init_arm_arch_features(void) {
-#if defined(__linux__) && defined(__aarch64__)
-    uint32_t hwcap = getauxval(AT_HWCAP);
-    uint32_t hwcap2 = getauxval(AT_HWCAP2);
-
-    ggml_arm_arch_features.has_neon    = !!(hwcap & HWCAP_ASIMD);
-    ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
-    ggml_arm_arch_features.has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
-    ggml_arm_arch_features.has_sve     = !!(hwcap & HWCAP_SVE);
-    ggml_arm_arch_features.has_sme     = !!(hwcap2 & HWCAP2_SME);
-
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
     ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
 #endif
-#elif defined(__APPLE__)
-    int oldp = 0;
-    size_t size = sizeof(oldp);
-    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    ggml_arm_arch_features.has_neon = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    ggml_arm_arch_features.has_dotprod = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    ggml_arm_arch_features.has_i8mm = oldp;
-
-    if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
-        oldp = 0;
-    }
-    ggml_arm_arch_features.has_sme = oldp;
-
-    ggml_arm_arch_features.has_sve = 0;
-    ggml_arm_arch_features.sve_cnt = 0;
-#else
-// Run-time CPU feature detection not implemented for this platform, fallback to compile time
-#if defined(__ARM_NEON)
-    ggml_arm_arch_features.has_neon = 1;
-#else
-    ggml_arm_arch_features.has_neon = 0;
-#endif
-
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    ggml_arm_arch_features.has_i8mm = 1;
-#else
-    ggml_arm_arch_features.has_i8mm = 0;
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-    ggml_arm_arch_features.has_sve = 1;
-    ggml_arm_arch_features.sve_cnt = 16;
-#else
-    ggml_arm_arch_features.has_sve = 0;
-    ggml_arm_arch_features.sve_cnt = 0;
-#endif
-
-#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
-    ggml_arm_arch_features.has_sme = 1;
-#else
-    ggml_arm_arch_features.has_sme = 0;
-#endif
-#endif
 }
-#endif

 #endif // __ARM_ARCH

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
     GGML_ASSERT(!ggml_get_no_alloc(ctx));
@@ -805,7 +740,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
             {
                 assert(tensor->nb[0] == sizeof(ggml_fp16_t));
                 for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
                 }
             } break;
         case GGML_TYPE_BF16:
@@ -864,7 +799,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
             {
                 assert(tensor->nb[0] == sizeof(ggml_fp16_t));
                 for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
                 }
             } break;
         case GGML_TYPE_BF16:
@@ -915,7 +850,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
         case GGML_TYPE_F16:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+                return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_BF16:
             {
@@ -960,7 +895,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
         case GGML_TYPE_F16:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
@@ -989,7 +924,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
         case GGML_TYPE_I32:
             return ((int32_t *) data)[0];
         case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+            return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
         case GGML_TYPE_BF16:
             return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
         case GGML_TYPE_F32:
@@ -1016,7 +951,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             } break;
         case GGML_TYPE_F16:
             {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+                ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
@@ -1054,7 +989,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
             }
         case GGML_TYPE_F16:
             {
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+                return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_BF16:
             {
@@ -1093,7 +1028,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
             } break;
         case GGML_TYPE_F16:
             {
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
@@ -1120,7 +1055,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
         case GGML_TYPE_I32:
             return ((int32_t *) data)[0];
         case GGML_TYPE_F16:
-            return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+            return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
         case GGML_TYPE_BF16:
             return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
         case GGML_TYPE_F32:
@@ -1147,7 +1082,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
             } break;
         case GGML_TYPE_F16:
             {
-                ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+                ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
@@ -1258,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
         }
     }
 }

-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {

@@ -1883,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_get_rows_back(params, tensor);
             } break;
+        case GGML_OP_SET_ROWS:
+            {
+                ggml_compute_forward_set_rows(params, tensor);
+            } break;
         case GGML_OP_DIAG:
             {
                 ggml_compute_forward_diag(params, tensor);
@@ -1927,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -1959,6 +1902,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad_reflect_1d(params, tensor);
             } break;
+        case GGML_OP_ROLL:
+            {
+                ggml_compute_forward_roll(params, tensor);
+            } break;
         case GGML_OP_ARANGE:
             {
                 ggml_compute_forward_arange(params, tensor);
@@ -2006,6 +1953,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_unary(params, tensor);
             } break;
+        case GGML_OP_GLU:
+            {
+                ggml_compute_forward_glu(params, tensor);
+            } break;
         case GGML_OP_GET_REL_POS:
             {
                 ggml_compute_forward_get_rel_pos(params, tensor);
@@ -2216,6 +2167,20 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     GGML_ABORT("fatal error");
             }
             break;
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(node)) {
+                case GGML_GLU_OP_REGLU:
+                case GGML_GLU_OP_GEGLU:
+                case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_GEGLU_ERF:
+                case GGML_GLU_OP_GEGLU_QUICK:
+                    {
+                        n_tasks = n_threads;
+                    } break;
+                default:
+                    GGML_ABORT("fatal error");
+            }
+            break;
         case GGML_OP_SILU_BACK:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
@@ -2232,6 +2197,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_GET_ROWS:
+        case GGML_OP_SET_ROWS:
             {
                 // FIXME: get_rows can use additional threads, but the cost of launching additional threads
                 // decreases performance with GPU offloading
@@ -2268,6 +2234,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
+        case GGML_OP_CONV_2D:
         case GGML_OP_CONV_2D_DW:
         case GGML_OP_CONV_TRANSPOSE_1D:
         case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2283,6 +2250,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_PAD_REFLECT_1D:
+        case GGML_OP_ROLL:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT:
@@ -2785,6 +2753,10 @@ struct ggml_cplan ggml_graph_plan(
                         GGML_ABORT("fatal error");
                 }
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                cur = GGML_IM2COL_WORK_SIZE;
+            } break;
         case GGML_OP_CONV_TRANSPOSE_2D:
             {
                 const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -3185,6 +3157,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
     return ggml_graph_compute(cgraph, &cplan);
 }

+void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) {
+    memcpy(y, x, n * sizeof(float));
+}
+
 void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
     int64_t i = 0;
 #if defined(__F16C__)
@@ -3205,9 +3181,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
+        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
+        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+    }
+    for (; i + 3 < n; i += 4) {
+        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
+        float32x4_t v_zero = vec_splats(0.0f);
+        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
+        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+    }
 #endif
     for (; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
     }
 }

@@ -3231,9 +3222,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
+#elif defined(__NNPA__)
+    for (; i + 7 < n; i += 8) {
+        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i + 0));
+        vec_xst(v_yl, 0, (float *)(y + i + 4));
+    }
+    for (; i + 3 < n; i += 4) {
+        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
+        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+        vec_xst(v_yh, 0, (float *)(y + i));
+    }
 #endif

     for (; i < n; ++i) {
-        y[i] = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
     }
 }

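Both converters above share the same loop shape: the widest stride first (a full 8-element block, guarded by `i + 7 < n`), then at most one 4-element cleanup block, then a scalar tail of 0-3 elements. A compressed sketch of that structure with scalar stand-ins for the intrinsics; `fp16_to_fp32_scalar` is the hypothetical helper sketched earlier, not a ggml function.

#include <cstdint>

typedef uint16_t ggml_fp16_t;               // as in ggml
float fp16_to_fp32_scalar(uint16_t h);      // e.g. the earlier sketch

// Widest pass first, then a narrower pass, then scalars.
void fp16_to_fp32_rows(const ggml_fp16_t * x, float * y, int64_t n) {
    int64_t i = 0;
    for (; i + 7 < n; i += 8) {   // full 8-element blocks (vectorized above)
        for (int k = 0; k < 8; k++) y[i + k] = fp16_to_fp32_scalar(x[i + k]);
    }
    for (; i + 3 < n; i += 4) {   // at most one 4-element block
        for (int k = 0; k < 4; k++) y[i + k] = fp16_to_fp32_scalar(x[i + k]);
    }
    for (; i < n; ++i) {          // 0-3 leftover elements
        y[i] = fp16_to_fp32_scalar(x[i]);
    }
}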
@@ -3433,9 +3440,17 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }

+int ggml_cpu_has_nnpa(void) {
+#if defined(GGML_NNPA)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
-    return ggml_arm_arch_features.has_neon;
+    return 1;
 #else
     return 0;
 #endif
@@ -3443,7 +3458,7 @@ int ggml_cpu_has_neon(void) {

 int ggml_cpu_has_dotprod(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
-    return ggml_arm_arch_features.has_dotprod;
+    return 1;
 #else
     return 0;
 #endif
@@ -3451,7 +3466,7 @@ int ggml_cpu_has_dotprod(void) {

 int ggml_cpu_has_sve(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
-    return ggml_arm_arch_features.has_sve;
+    return 1;
 #else
     return 0;
 #endif
@@ -3459,7 +3474,7 @@ int ggml_cpu_has_sve(void) {

 int ggml_cpu_has_matmul_int8(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
-    return ggml_arm_arch_features.has_i8mm;
+    return 1;
 #else
     return 0;
 #endif
@@ -3475,14 +3490,14 @@ int ggml_cpu_get_sve_cnt(void) {

 int ggml_cpu_has_sme(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
-    return ggml_arm_arch_features.has_sme;
+    return 1;
 #else
     return 0;
 #endif
 }

 void ggml_cpu_init(void) {
-    // needed to initialize f16 tables
+    // needed to initialize ggml_time
     {
         struct ggml_init_params params = { 0, NULL, false };
         struct ggml_context * ctx = ggml_init(params);
@@ -3503,9 +3518,10 @@ void ggml_cpu_init(void) {
                 uint16_t u16;
                 ggml_fp16_t fp16;
             } u = {i};
-            float f = GGML_FP16_TO_FP32(u.fp16);
-            ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-            ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+            float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+            ggml_table_f32_f16[i] = f;
+            ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
+            ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
         }

         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

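The init hunk above now fills `ggml_table_f32_f16` alongside the GELU tables: with only 65536 possible fp16 bit patterns, storing one float per pattern (256 KB) turns every later conversion into a single array load. A sketch of the idea; the macro spelling at the end is an assumption for illustration, not the exact ggml definition, which also has hardware-instruction variants.

#include <cstdint>

typedef uint16_t ggml_fp16_t;
float ggml_table_f32_f16[1 << 16];            // 65536 floats = 256 KB

float fp16_to_fp32_compute(ggml_fp16_t h);    // the "slow" bitwise conversion

// Fill once at init: every possible 16-bit pattern gets its float value.
void init_f16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); i++) {
        ggml_table_f32_f16[i] = fp16_to_fp32_compute((ggml_fp16_t) i);
    }
}

// Afterwards a conversion is one indexed load (hypothetical spelling).
#define FP16_TO_FP32_LOOKUP(x) ggml_table_f32_f16[(uint16_t)(x)]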
@@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st

     switch (op->op) {
         case GGML_OP_CPY:
+        case GGML_OP_SET_ROWS:
             return
                 op->type != GGML_TYPE_IQ3_XXS &&
                 op->type != GGML_TYPE_IQ3_S   &&
@@ -578,6 +579,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_vxe()) {
             features.push_back({ "VXE", "1" });
         }
+        if (ggml_cpu_has_nnpa()) {
+            features.push_back({ "NNPA", "1" });
+        }
         if (ggml_cpu_has_wasm_simd()) {
             features.push_back({ "WASM_SIMD", "1" });
         }

@@ -52,8 +52,8 @@
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
+#include "simd-mappings.h"

-#include <atomic>
 #include <array>
 #include <type_traits>
@@ -63,7 +63,7 @@
 #define NOINLINE __attribute__((__noinline__))
 #endif

-#if defined(__ARM_NEON) || defined(__AVX512F__)
+#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
 #define VECTOR_REGISTERS 32
 #else
 #define VECTOR_REGISTERS 16
@@ -74,7 +74,7 @@
 namespace {

 inline float unhalf(ggml_fp16_t d) {
-    return GGML_FP16_TO_FP32(d);
+    return GGML_CPU_FP16_TO_FP32(d);
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -110,6 +110,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
 inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

+#if defined(__VXE__) || defined(__VXE2__)
+inline float32x4_t add(float32x4_t x, float32x4_t y) { return vec_add(x, y); }
+inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); }
+inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
+#endif
+
 #if defined(__MMA__)
 typedef vector unsigned char vec_t;
 typedef __vector_quad acc_t;
@@ -163,6 +169,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
 #endif
 #endif

+#if defined(__VXE__) || defined(__VXE2__)
+template <>
+inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
+    return vec_madd(a, b, c);
+}
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED HORIZONTAL SUM

@@ -179,6 +192,13 @@ inline float hsum(float16x8_t x) {
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

+#if defined(__VXE__) || defined(__VXE2__)
+inline float hsum(float32x4_t x) {
+    float32x4_t tmp = x + vec_reve(x);
+    return tmp[0] + tmp[1];
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 inline float hsum(__m128 x) {
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
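The new VXE `hsum(float32x4_t)` above is worth a second look: `vec_reve` reverses the lanes, so `x + vec_reve(x)` yields `[x0+x3, x1+x2, x2+x1, x3+x0]`, and the first two lanes between them already hold the full sum `x0+x1+x2+x3`. A scalar model of that identity:

#include <cassert>

// Scalar model of the VXE hsum: adding a vector to its lane-reversed
// self leaves the total split across the first two lanes.
static float hsum4_model(const float x[4]) {
    float tmp[4] = { x[0] + x[3], x[1] + x[2], x[2] + x[1], x[3] + x[0] };
    return tmp[0] + tmp[1]; // == x[0] + x[1] + x[2] + x[3]
}

int main() {
    float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    assert(hsum4_model(v) == 10.0f);
    return 0;
}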
@@ -228,6 +248,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
 #endif // _MSC_VER
 #endif // __ARM_NEON

+#if defined(__VXE__) || defined(__VXE2__)
+template <> inline float32x4_t load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
+    }
+
+    return vec_xl(0, (const float *)(tmp));
+}
+template <> inline float32x4_t load(const float * p) {
+    return vec_xl(0, p);
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <> inline __m128 load(const float *p) {
     return _mm_loadu_ps(p);
@@ -394,8 +429,6 @@ class tinyBLAS {

     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +443,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }

         ggml_barrier(params->threadpool);
@@ -439,8 +472,7 @@ class tinyBLAS {
             GGML_ASSERT(jj == jj2);
         }

-        // next step.
-        job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+        job = ggml_threadpool_chunk_add(params->threadpool, 1);
     }

     ggml_barrier(params->threadpool);
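The three hunks above replace a function-local `static std::atomic<int64_t>` with the shared threadpool counter. Besides moving ownership of the counter to the pool, this sidesteps a template subtlety: `gemm<RM, RN, BM>` is a member template, so each instantiation carried its own independent static counter. A compressed, self-contained illustration of that pitfall (names are illustrative):

#include <atomic>
#include <cassert>

template <int N>
int bump() {
    static std::atomic<int> counter{0}; // one counter PER instantiation
    return counter.fetch_add(1);
}

int main() {
    bump<1>(); bump<1>();
    // bump<2>'s counter is a distinct object and still starts at 0:
    assert(bump<2>() == 0);
    return 0;
}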
@@ -3323,6 +3355,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (const float *)B, ldb,
                     (float *)C, ldc};
         return tb.matmul(m, n);
+#elif defined(__VXE__) || defined(__VXE2__)
+        if (n < 4)
+            return false;
+        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
 #elif defined(__MMA__)
         if (k % 8)
             return false;
@@ -3414,6 +3454,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (float *)C, ldc};
             return tb.matmul(m, n);
         }
+#elif defined(__VXE__) || defined(__VXE2__)
+        if (n < 4)
+            return false;
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+                k, (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
 #endif
         return false;
     }

@@ -1,6 +1,11 @@
 #pragma once
 #include <stdint.h>
+#include <stdbool.h>
+
+#if defined(__VXE__) || defined(__VXE2__)
+#include <vecintrin.h>
+#endif

 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -20,6 +20,9 @@

 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -53,6 +56,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str
 void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -64,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -72,6 +77,7 @@ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params
 void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_roll(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -92,6 +98,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st
 void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -104,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);

 #ifdef __cplusplus
 }

@@ -2,9 +2,12 @@
 #include "ggml-common.h"

 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "ggml-quants.h"
 #include "quants.h"

+#include "arch-fallback.h"
+
 #include <string.h>
 #include <assert.h>
 #include <float.h>
@@ -38,12 +41,10 @@ void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q8_0_ref(x, y, k);
 }
-GGML_CPU_NATIVE_IMPL(quantize_row_q8_0)

 void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q8_1_ref(x, y, k);
 }
-GGML_CPU_NATIVE_IMPL(quantize_row_q8_1)

 //
 // 2-6 bit quantization in super-blocks
@@ -104,7 +105,6 @@ void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy,
 void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
     quantize_row_q8_K_ref(x, y, k);
 }
-GGML_CPU_NATIVE_IMPL(quantize_row_q8_K)

 //===================================== Dot products =================================

@@ -138,12 +138,11 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }

         int sumi = sumi0 + sumi1;
-        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q4_0_q8_0)

 // TODO: add WASM SIMD
 void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -176,12 +175,11 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q4_1_q8_1)

 void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
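The q4_1 accumulation above (`d_x*d_y*sumi + m_x*s_y`) falls out of the affine dequantization `x[j] = d_x*q[j] + m_x`: the dot product against `y[j] = d_y*qy[j]` splits into `d_x*d_y*sum(q*qy) + m_x*(d_y*sum(qy))`, and q8_1 pre-stores `s = d_y*sum(qy)` so the second term costs nothing per element. A scalar check of that identity under those definitions (block size 4 is only for the illustration):

#include <cassert>
#include <cmath>

// Check: dot(x, y) == d_x*d_y*sum(q*qy) + m_x*(d_y*sum(qy))
int main() {
    const float d_x = 0.5f, m_x = -1.0f, d_y = 0.25f;
    const int   q [4] = {  0, 3, 7, 15 }; // unsigned 4-bit quants
    const int   qy[4] = { -8, 5, 2, -1 }; // signed 8-bit quants

    float dot = 0.0f;
    int   sumi = 0, sumy = 0;
    for (int j = 0; j < 4; j++) {
        dot  += (d_x * q[j] + m_x) * (d_y * qy[j]);
        sumi += q[j] * qy[j];
        sumy += qy[j];
    }
    const float s_y = d_y * sumy; // precomputed per block in q8_1
    assert(std::fabs(dot - (d_x * d_y * sumi + m_x * s_y)) < 1e-5f);
    return 0;
}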
@@ -220,12 +218,11 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q5_0_q8_0)

 void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_1;
@@ -264,12 +261,11 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c
         }

         int sumi = sumi0 + sumi1;
-        sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+        sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q5_1_q8_1)

 void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
@@ -295,12 +291,11 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             sumi += x[ib].qs[j]*y[ib].qs[j];
         }

-        sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+        sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q8_0_q8_0)

 void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
@@ -348,12 +343,11 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             }
         }

-        sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+        sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_tq1_0_q8_K)

 void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
@@ -379,14 +373,13 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
             }
         }

-        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

         sumf += (float) sumi * d;
     }

     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_tq2_0_q8_K)

 void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(nrc == 1);
@@ -413,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             summs += y[i].bsums[j] * (sc[j] >> 4);
         }

-        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

         int isum = 0;
         int is = 0;
@@ -439,7 +432,6 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
     }
     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q2_K_q8_K)

 void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -513,13 +505,12 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q3_K_q8_K)

 void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -587,15 +578,14 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q4_K_q8_K)

 void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -668,15 +658,14 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-        const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
         sumf -= dmin * sumi;
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q5_K_q8_K)

 void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -726,13 +715,12 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c
             for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
             q8 += 8; a += 8;
         }
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
     }
     for (int l = 0; l < 8; ++l) sumf += sums[l];
     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_q6_K_q8_K)

 void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -752,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const int8_t   * GGML_RESTRICT q8 = y[i].qs;
         int32_t bsum = 0;
@@ -775,7 +763,6 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
     *s = 0.125f * sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq2_xxs_q8_K)

 void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -792,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
         const uint8_t  * GGML_RESTRICT sc = x[i].scales;
         const int8_t   * GGML_RESTRICT q8 = y[i].qs;
@@ -826,7 +813,6 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
     *s = 0.125f * sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq2_xs_q8_K)

 void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -844,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     float sumf = 0;
     for (int i = 0; i < nb; i++) {

-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const int8_t  * q8 = y[i].qs;
         const uint8_t * qs = x[i].qs;
         const uint8_t * qh = x[i].qh;
@@ -879,7 +865,6 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,

     *s = 0.125f * sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq2_s_q8_K)

 void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -898,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
         const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
         const int8_t  * GGML_RESTRICT q8 = y[i].qs;
@@ -924,7 +909,6 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
     *s = 0.25f * sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq3_xxs_q8_K)

 void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -941,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,

     float sumf = 0.f;
     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -981,7 +965,6 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
     }
     *s = sumf;
 }
-GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq3_s_q8_K)

 void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
@@ -1020,12 +1003,11 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
qs += 4;
|
||||
}
|
||||
|
||||
sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
||||
sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq1_s_q8_K)
|
||||
|
||||
void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
@@ -1082,12 +1064,11 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
qh += 2;
|
||||
}
|
||||
|
||||
sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
||||
sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
}
|
||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq1_m_q8_K)
|
||||
|
||||
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
@@ -1107,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
float sumf = 0;
|
||||
|
||||
for (; ib < nb; ++ib) {
|
||||
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
|
||||
const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < QK4_NL/2; ++j) {
|
||||
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
||||
@@ -1117,7 +1098,6 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq4_nl_q8_0)
|
||||
|
||||
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
@@ -1134,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
|
||||
float sumf = 0;
|
||||
for (int ibl = 0; ibl < nb; ++ibl) {
|
||||
const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
|
||||
uint16_t h = x[ibl].scales_h;
|
||||
const uint8_t * qs = x[ibl].qs;
|
||||
const int8_t * q8 = y[ibl].qs;
|
||||
@@ -1164,7 +1144,6 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
GGML_CPU_NATIVE_IMPL(ggml_vec_dot_iq4_xs_q8_K)
|
||||
|
||||
// ============================ 4-bit non-linear quants
|
||||
|
||||
|
||||
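The iq4_nl kernel above unpacks two 4-bit codebook indices from each byte of x[ib].qs and maps them through the kvalues_iq4nl table before the int8 multiply. A minimal scalar sketch of that unpacking; the 16-entry codebook values here are made up, standing in for the real kvalues_iq4nl:

#include <stdint.h>

// Hypothetical stand-in for kvalues_iq4nl: any 16-entry int8 codebook.
static const int8_t kvalues_demo[16] = {
    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
};

// Dot one block: byte j carries index j in its low nibble and index
// j + qk/2 in its high nibble, mirroring the loop shown above.
static int dot_nibbles(const uint8_t * qs, const int8_t * q8, int qk) {
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < qk/2; ++j) {
        sumi1 += q8[j]        * kvalues_demo[qs[j] & 0xf];
        sumi2 += q8[j + qk/2] * kvalues_demo[qs[j] >>  4];
    }
    return sumi1 + sumi2;
}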
@@ -84,33 +84,6 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

#if defined(GGML_CPU_GENERIC)
#define quantize_row_q8_0_generic quantize_row_q8_0
#define quantize_row_q8_1_generic quantize_row_q8_1
#define quantize_row_q8_K_generic quantize_row_q8_K
#define ggml_vec_dot_q4_0_q8_0_generic ggml_vec_dot_q4_0_q8_0
#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1
#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0
#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1
#define ggml_vec_dot_q8_0_q8_0_generic ggml_vec_dot_q8_0_q8_0
#define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K
#define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K
#define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K
#define ggml_vec_dot_q3_K_q8_K_generic ggml_vec_dot_q3_K_q8_K
#define ggml_vec_dot_q4_K_q8_K_generic ggml_vec_dot_q4_K_q8_K
#define ggml_vec_dot_q5_K_q8_K_generic ggml_vec_dot_q5_K_q8_K
#define ggml_vec_dot_q6_K_q8_K_generic ggml_vec_dot_q6_K_q8_K
#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
#define ggml_vec_dot_iq2_s_q8_K_generic ggml_vec_dot_iq2_s_q8_K
#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
#endif

#ifdef __cplusplus
}
#endif

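The hunk above (33 old lines down to 6) drops the GGML_CPU_GENERIC alias block. The pattern it removed is worth noting; a minimal sketch with hypothetical names, not the actual build setup:

/* On a generic-only build, the preprocessor used to rename the portable
 * implementation to the public symbol, so no dispatch stub was needed.
 * my_kernel / my_kernel_generic are illustrative names. */
#if defined(GGML_CPU_GENERIC)
#define my_kernel_generic my_kernel
#endif

void my_kernel_generic(int n, float * dst) { /* after the #define above,  */
    for (int i = 0; i < n; ++i) {            /* this body is compiled as  */
        dst[i] = 0.0f;                       /* the public my_kernel(...) */
    }
}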
@@ -6,8 +6,11 @@
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-impl.h"
#include "simd-mappings.h"
#include "traits.h"

#include "arch-fallback.h"

#include <cmath>
#include <cstring>
#include <cassert>
@@ -70,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;

y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}

for (int j = 0; j < QK8_0 * 4; j++) {
@@ -83,7 +86,6 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_0_4x4)

void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
@@ -109,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
const float d = amax / ((1 << 7) - 1);
id[row_iter] = d ? 1.0f / d : 0.0f;

y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
}

for (int j = 0; j < QK8_0 * 4; j++) {
@@ -122,7 +124,6 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_0_4x8)

void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK_K == 256);
@@ -174,7 +175,6 @@ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GG
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_K_4x8)

} // extern "C"

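The quantize hunks above all compute the same per-row Q8_0 scale: the largest magnitude in the block maps to +/-127 and the scale is stored as fp16. A self-contained scalar sketch of that step, simplified to one block for reference:

#include <math.h>
#include <stdint.h>

// One Q8_0 block: find amax, derive d = amax/127 and its inverse,
// then round each value to int8. Mirrors the scale logic shown above.
static float quantize_block_q8_0_ref(const float * x, int8_t * q, int qk) {
    float amax = 0.0f;
    for (int j = 0; j < qk; ++j) {
        amax = fmaxf(amax, fabsf(x[j]));
    }
    const float d  = amax / ((1 << 7) - 1);
    const float id = d ? 1.0f / d : 0.0f;  // guard the all-zero block
    for (int j = 0; j < qk; ++j) {
        q[j] = (int8_t) roundf(x[j] * id);
    }
    return d;  // caller stores this as fp16
}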
@@ -237,14 +237,13 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_4x4_q8_0)

void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -282,14 +281,13 @@
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_4x8_q8_0)

void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -328,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@@ -336,7 +334,6 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_8x8_q8_0)

void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
@@ -400,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
@@ -415,7 +412,6 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_K_8x8_q8_K)

void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -454,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
}
sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
}
}
}
@@ -462,7 +458,6 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemv_iq4_nl_4x4_q8_0)

void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -506,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@@ -519,7 +514,6 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_4x4_q8_0)

void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -562,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@@ -574,7 +568,6 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_4x8_q8_0)

void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -617,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@@ -629,7 +622,6 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_8x8_q8_0)

void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
@@ -697,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
@@ -706,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
for(int m = 0; m < 4; m++) {
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for(int j = 0; j < ncols_interleaved; j++) {
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}
@@ -719,7 +711,6 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_K_8x8_q8_K)

void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
@@ -763,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
}
sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
}
}
}
@@ -776,7 +767,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
}
}
}
GGML_CPU_NATIVE_IMPL(ggml_gemm_iq4_nl_4x4_q8_0)

} // extern "C"

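All of the GEMV/GEMM kernels above share the same accumulation shape: each quantized block contributes an integer dot product scaled by the fp16 block scales of both operands. A generic scalar sketch of that structure, with illustrative parameter names:

#include <stdint.h>

// Per-block int8 dot with per-block scales ad/bd, as in the kernels above.
static float block_dot_ref(int nblocks, int qk,
                           const int8_t * aq, const float * ad,
                           const int8_t * bq, const float * bd) {
    float sumf = 0.0f;
    for (int l = 0; l < nblocks; ++l) {
        int sumi = 0;
        for (int k = 0; k < qk; ++k) {
            sumi += aq[l*qk + k] * bq[l*qk + k];
        }
        sumf += sumi * ad[l] * bd[l];
    }
    return sumf;
}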
@@ -1174,13 +1164,24 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
// not really a GGML_TYPE_Q8_0 but same size.
switch (op->op) {
case GGML_OP_MUL_MAT:
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
return true;
{
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
return true;
}
case GGML_OP_MUL_MAT_ID:
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
return true;
{
size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.

const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
const int64_t ne12 = op->src[1]->ne[2]; // n_tokens

const size_t sizeof_mmid_row_mapping = sizeof(int64_t);

size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);

return true;
}
default:
// GGML_ABORT("fatal error");
break;
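A worked instance of the MUL_MAT_ID work-size formula above, with made-up shapes (8 experts, 5 tokens, 1024 bytes of quantized src1):

#include <stddef.h>
#include <stdint.h>

static size_t mul_mat_id_wsize(size_t src1_bytes, int64_t ne02, int64_t ne12) {
    size_t size = src1_bytes;                                     // quantized src1
    size = (size + sizeof(int64_t) - 1) & ~(sizeof(int64_t) - 1); // GGML_PAD to 8
    size += sizeof(int64_t) * (size_t)ne02 * (size_t)(ne12 + 1);  // row mappings
    return size; // mul_mat_id_wsize(1024, 8, 5) == 1024 + 384 == 1408
}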
@@ -1316,14 +1317,17 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
int32_t i2;
};

GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
n_as * ne12 * sizeof(mmid_row_mapping)));
GGML_ASSERT(params->wsize >=
(GGML_PAD(nbw3, sizeof(int64_t)) +
n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
);

auto * wdata = (char *) params->wdata;
auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
auto * wdata = (char *)params->wdata;
auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));

struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
// total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]

// src1: float32 => param type
for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -1408,44 +1412,45 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
}
};

// instance for Q4
static const tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
static const tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
static const tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

// instance for IQ4
static const tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

} // namespace ggml::cpu::repack

static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {

// instance for Q4
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

// instance for IQ4
static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

if (cur->type == GGML_TYPE_Q4_0) {
if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
if (cur->ne[1] % 8 == 0) {
return &ggml::cpu::repack::q4_0_8x8_q8_0;
return &q4_0_8x8_q8_0;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (cur->ne[1] % 4 == 0) {
return &ggml::cpu::repack::q4_0_4x8_q8_0;
return &q4_0_4x8_q8_0;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 4 == 0) {
return &ggml::cpu::repack::q4_0_4x4_q8_0;
return &q4_0_4x4_q8_0;
}
}
} else if (cur->type == GGML_TYPE_Q4_K) {
if (ggml_cpu_has_avx2()) {
if (cur->ne[1] % 8 == 0) {
return &ggml::cpu::repack::q4_K_8x8_q8_K;
return &q4_K_8x8_q8_K;
}
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 4 == 0) {
return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
return &iq4_nl_4x4_q8_0;
}
}
}

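The hunk above moves the trait singletons from namespace scope into the dispatcher itself. Function-local statics are initialized on first use rather than at load time, which appears to be the point of the change. A minimal C sketch of the pattern, with illustrative types and names:

struct traits { int interleave; };

static const struct traits * pick_repack(int rows_per_tile) {
    static const struct traits t8x8 = { 8 }; // lives inside the dispatcher,
    static const struct traits t4x4 = { 4 }; // created once, on first call
    return (rows_per_tile % 8 == 0) ? &t8x8 : &t4x4;
}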
@@ -64,10 +64,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
extern "C" {
#endif

// Workaround for clang:
// clang++ complains: ``error: call to 'ggml_gemm_q4_0_4x4_q8_0' is ambiguous''
// repro: https://godbolt.org/z/oKdeWKonM (ICE), https://godbolt.org/z/1szq6P36v (ambiguous call)
#if defined(GGML_CPU_CLANG_WORKAROUND) || !(defined(__GNUC__) && defined(__clang__)) || defined(__HIPCC__)
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -81,7 +77,6 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#endif // !defined(__clang__)

// Native implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -98,22 +93,6 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#if defined(GGML_CPU_GENERIC)
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#endif

#if defined(__cplusplus)
} // extern "C"
#endif

@@ -2,10 +2,167 @@

#include "ggml-cpu-impl.h"

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif

#if defined(__F16C__)
#include <immintrin.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

//
// simd mappings
//

// FP16 to FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)

#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)

static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
__fp16 tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}

static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
__fp16 tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#elif defined(__F16C__)
#ifdef _MSC_VER
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)

static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;
double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}

static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
double d;
ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#elif defined(__riscv) && defined(__riscv_zfhmin)
static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;
__asm__(
"fmv.h.x %[f], %[h]\n\t"
"fcvt.s.h %[f], %[f]"
: [f] "=&f" (f)
: [h] "r" (h)
);
return f;
}

static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
__asm__(
"fcvt.h.s %[f], %[f]\n\t"
"fmv.x.h %[h], %[f]"
: [h] "=&r" (res)
: [f] "f" (f)
);
return res;
}

#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#elif defined(__NNPA__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)

#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)

static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
uint16x8_t v_h = vec_splats(h);
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
return vec_extend_to_fp32_hi(v_hd, 0)[0];
}

static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
float32x4_t v_f = vec_splats(f);
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
return vec_extract(v_h, 0);
}
#endif

// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}

#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_CPU_FP32_TO_FP16)
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif

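The ggml_table_f32_f16 lookup above covers every 16-bit pattern, so a conversion is one indexed load. A self-contained sketch of how such a table could be populated at startup; the software converter below is a stand-in for any exact fp16-to-fp32 routine like the ones defined above:

#include <stdint.h>
#include <string.h>

// Exact software fp16 -> fp32 via bit manipulation.
static float fp16_to_fp32_ref(uint16_t h) {
    uint32_t sign = (uint32_t)(h >> 15) << 31;
    uint32_t exp  = (h >> 10) & 0x1f;
    uint32_t man  = h & 0x3ff;
    uint32_t bits;
    if (exp == 0) {                   // zero or subnormal
        if (man == 0) {
            bits = sign;
        } else {                      // normalize the subnormal
            int shift = 0;
            while ((man & 0x400) == 0) { man <<= 1; ++shift; }
            man &= 0x3ff;
            bits = sign | ((uint32_t)(113 - shift) << 23) | (man << 13); // 113 = 127 - 15 + 1
        }
    } else if (exp == 0x1f) {         // inf / NaN
        bits = sign | (0xffu << 23) | (man << 13);
    } else {                          // normal
        bits = sign | ((exp - 15 + 127) << 23) | (man << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

// One-time build of a full 64 K-entry table, as the 256 KB table above implies.
static float table_f32_f16[1 << 16];

static void init_table_f32_f16(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_f32_f16[i] = fp16_to_fp32_ref((uint16_t) i);
    }
}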
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
@@ -32,7 +189,7 @@
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
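The svmad reorder above changes the macro's calling convention: svmad multiplies its first two vector operands and adds the third, so after the swap GGML_F32xt_FMA(a, b, c) evaluates b*c + a, and call sites (see the vec.cpp hunks later in this diff) now pass the accumulator first. A scalar model of the new convention, with illustrative names:

#include <stdio.h>

/* GGML_F32_VEC_FMA(acc, a, b) now means acc + a*b, accumulator first. */
static float vec_fma_model(float acc, float a, float b) {
    return acc + a * b;
}

int main(void) {
    const float x[4] = {1, 2, 3, 4};
    const float y[4] = {5, 6, 7, 8};
    float sum = 0.0f;
    for (int i = 0; i < 4; ++i) {
        sum = vec_fma_model(sum, x[i], y[i]); /* sum += x[i]*y[i] */
    }
    printf("%.1f\n", sum); /* 70.0 */
    return 0;
}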
@@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
float tmp[8];

for (int i = 0; i < 8; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}

return _mm256_loadu_ps(tmp);
@@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
_mm256_storeu_ps(arr, y);

for (int i = 0; i < 8; i++)
x[i] = GGML_FP32_TO_FP16(arr[i]);
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) {
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
float tmp[4];

tmp[0] = GGML_FP16_TO_FP32(p[0]);
tmp[1] = GGML_FP16_TO_FP32(p[1]);
tmp[2] = GGML_FP16_TO_FP32(p[2]);
tmp[3] = GGML_FP16_TO_FP32(p[3]);
tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);

return wasm_v128_load(tmp);
}
@@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {

wasm_v128_store(tmp, x);

p[0] = GGML_FP32_TO_FP16(tmp[0]);
p[1] = GGML_FP32_TO_FP16(tmp[1]);
p[2] = GGML_FP32_TO_FP16(tmp[2]);
p[3] = GGML_FP32_TO_FP16(tmp[3]);
p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4 v128_t
@@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];

tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

return _mm_loadu_ps(tmp);
}
@@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {

_mm_storeu_ps(arr, y);

x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
@@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32x4_ZERO __lsx_vldi(0)
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
@@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
float tmp[4];

tmp[0] = GGML_FP16_TO_FP32(x[0]);
tmp[1] = GGML_FP16_TO_FP32(x[1]);
tmp[2] = GGML_FP16_TO_FP32(x[2]);
tmp[3] = GGML_FP16_TO_FP32(x[3]);
tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

return __lsx_vld(tmp, 0);
}
@@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {

__lsx_vst(y, arr, 0);

x[0] = GGML_FP32_TO_FP16(arr[0]);
x[1] = GGML_FP32_TO_FP16(arr[1]);
x[2] = GGML_FP32_TO_FP16(arr[2]);
x[3] = GGML_FP32_TO_FP16(arr[3]);
x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
@@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __vector float
#define GGML_F32x4 float32x4_t
#define GGML_F32x4_ZERO vec_splats(0.0f)
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
@@ -944,10 +1101,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
for (int i = 0; i < offset; ++i) { \
x[i] = vec_add(x[i], x[offset + i]); \
} \
res = vec_extract(x[0], 0) + \
vec_extract(x[0], 1) + \
vec_extract(x[0], 2) + \
vec_extract(x[0], 3); \
float32x4_t tmp = x[0] + vec_reve(x[0]); \
res = tmp[0] + tmp[1]; \
}

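The new GGML_F32x4_REDUCE tail above replaces four vec_extract calls with one vector add against the element-reversed register: for x = {a, b, c, d}, x + vec_reve(x) = {a+d, b+c, c+b, d+a}, so lanes 0 and 1 already hold the full sum split in two halves. A scalar model:

// Scalar model of the reversed-vector horizontal sum used above.
static float hsum4_via_reverse(const float x[4]) {
    float tmp[4] = { x[0] + x[3], x[1] + x[2], x[2] + x[1], x[3] + x[0] };
    return tmp[0] + tmp[1]; // == x[0] + x[1] + x[2] + x[3]
}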
#define GGML_F32_VEC GGML_F32x4
@@ -964,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR

static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
#if defined(__NNPA__)
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
return vec_extend_to_fp32_hi(v_xd, 0);
#else
float tmp[4];

for (int i = 0; i < 4; i++) {
tmp[i] = GGML_FP16_TO_FP32(x[i]);
tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
}

// note: keep type-cast here to prevent compiler bugs
// see: https://github.com/ggml-org/llama.cpp/issues/12846
return vec_xl(0, (const float *)(tmp));
#endif
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
#if defined(__NNPA__)
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);

x[0] = vec_extract(v_x, 0);
x[1] = vec_extract(v_x, 1);
x[2] = vec_extract(v_x, 2);
x[3] = vec_extract(v_x, 3);
#else
float arr[4];

// note: keep type-cast here to prevent compiler bugs
// see: https://github.com/ggml-org/llama.cpp/issues/12846
vec_xst(y, 0, (float *)(arr));
vec_xst(v_y, 0, (float *)(arr));

for (int i = 0; i < 4; i++) {
x[i] = GGML_FP32_TO_FP16(arr[i]);
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#endif
}

#define GGML_F16_VEC GGML_F32x4
@@ -1006,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif

#ifdef __cplusplus
}
#endif

@@ -37,35 +37,35 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
for (int i = 0; i < np; i += ggml_f32_step) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);

ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);

ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);

ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);

ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);

ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);

ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);

ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
}
// leftovers
// Since the loop above is unrolled 8 times, the leftovers lie in [0, ggml_f32_step) and are handled in the loop below
@@ -73,7 +73,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
}
// the maximum number of leftover elements will be less than ggml_f32_epr; apply predicated svmad on the available elements only
if (np2 < n) {
@@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G

// leftovers
for (int i = np; i < n; ++i) {
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
#else
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
#endif

@@ -254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
}
}

void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
int i = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
for (; i + 15 < n; i += 16) {
_mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i)));
}
#elif defined(__AVX2__) && defined(__FMA__)
for (; i + 7 < n; i += 8) {
_mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i)));
}
#elif defined(__SSE2__)
for (; i + 3 < n; i += 4) {
_mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
}
#elif defined(__ARM_NEON) && defined(__aarch64__)
for (; i + 3 < n; i += 4) {
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
}
#endif
for (; i < n; ++i) {
y[i] = ggml_silu_f32(x[i]) * g[i];
}
}

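Reference semantics for the new ggml_vec_swiglu_f32 above: SwiGLU multiplies the SiLU-activated branch x by the linear branch g, elementwise, which is exactly what the scalar tail computes. A self-contained sketch:

#include <math.h>

static float silu_ref(float x) {
    return x / (1.0f + expf(-x)); // x * sigmoid(x)
}

static void vec_swiglu_ref(int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = silu_ref(x[i]) * g[i];
    }
}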
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
int i = 0;
ggml_float sum = 0;

@@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
}
}
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
@@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v)
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
}
}
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
@@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
}
}

inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
}
}
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
for (int i = 0; i < n; ++i) {
z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
}
}

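Every f16 helper above follows the same widen-compute-narrow pattern: both operands are widened to f32, the arithmetic runs once in f32, and the result is narrowed back to f16. A compilable sketch of the pattern; the converter pair here is assumed (any exact fp16/fp32 pair, such as the GGML_CPU_* macros above, fits):

#include <stdint.h>

float    fp16_to_fp32(uint16_t h); /* assumed exact converters, */
uint16_t fp32_to_fp16(float f);    /* declared for illustration */

static inline uint16_t f16_add(uint16_t a, uint16_t b) {
    return fp32_to_fp16(fp16_to_fp32(a) + fp16_to_fp32(b));
}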
@@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
// leftovers
for (int i = np; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
}
#else
for (int i = 0; i < n; ++i) {
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
}
#endif
@@ -163,49 +163,49 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const

ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

GGML_F32_VEC_STORE(y + i, ay1);

ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);

GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);

GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);

GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);

GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);

GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);

GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);

GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
}
@@ -215,7 +215,7 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);

GGML_F32_VEC_STORE(y + i, ay1);
}
@@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,

// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#endif
}
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
#endif
}

inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
// scalar ; TODO: Write SVE code
for (int i = 0; i < n; ++i) {
y[i] = x[i]*s + b;
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));

GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

GGML_F32_VEC ay[GGML_F32_ARR];

for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);

GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}

// leftovers
for (int i = np; i < n; ++i) {
y[i] = x[i]*s + b;
}
#endif
#else
// scalar
for (int i = 0; i < n; ++i) {
y[i] = x[i]*s + b;
}
#endif
}

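A quick usage example for the new ggml_vec_mad1_f32 above, which per its scalar branches computes y[i] = x[i]*s + b elementwise (assumes the vec.h header is in scope):

static void mad1_demo(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4];
    ggml_vec_mad1_f32(4, y, x, 2.0f, 1.0f); // y = {3, 5, 7, 9}
}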
//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
|
||||
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
@@ -430,12 +469,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -444,103 +483,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
|
||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float v = GGML_FP16_TO_FP32(x[i]);
|
||||
y[i] = GGML_FP32_TO_FP16(v*v);
|
||||
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(v*v);
|
||||
}
|
||||
}
|
||||
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
||||
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
|
||||
}
|
||||
}
|
||||
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
|
||||
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
|
||||
}
|
||||
}
|
||||
inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
|
||||
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
|
||||
}
|
||||
}
|
||||
 inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
 inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
     }
 }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
     }
 }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
 inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
     }
 }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
     }
 }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
     }
 }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
     }
 }
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
 inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
     }
 }
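All of the f16 variants above share one shape: widen each half-precision element to f32, apply the scalar op, and narrow the result back, since most CPUs have no native f16 arithmetic. A minimal sketch of that shared pattern (the ggml_vec_unary_f16 helper below is illustrative and not part of this diff):

    // Illustrative helper capturing the convert-compute-convert pattern of the
    // _f16 kernels above; op is any float -> float scalar function.
    static inline void ggml_vec_unary_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x,
                                          float (*op)(float)) {
        for (int i = 0; i < n; ++i) {
            y[i] = GGML_CPU_FP32_TO_FP16(op(GGML_CPU_FP16_TO_FP32(x[i])));
        }
    }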
@@ -562,9 +601,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp

 inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float xi = GGML_FP16_TO_FP32(x[i]);
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
         float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
-        y[i] = GGML_FP32_TO_FP16(res);
+        y[i] = GGML_CPU_FP32_TO_FP16(res);
     }
 }

@@ -577,9 +616,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
         } else if (x[i] >= 10.0f) {
             y[i] = x[i];
         } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
         }
     }
 }
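The middle case above is a lookup-table trick: an fp16 value has only 2^16 bit patterns, so GELU can be fully precomputed into ggml_table_gelu_f16 and evaluated by reinterpreting the fp16 bits as a table index, while the |x| >= 10 guards handle the saturated tails exactly. A sketch of how such a table could be filled (illustrative; ggml initializes its tables once at startup):

    // Illustrative: precompute gelu(x) for all 65536 fp16 bit patterns.
    static ggml_fp16_t ggml_table_gelu_f16[1 << 16];

    static void init_gelu_table(void) {
        for (uint32_t t = 0; t < (1u << 16); ++t) {
            const uint16_t bits = (uint16_t) t;
            ggml_fp16_t h;
            memcpy(&h, &bits, sizeof(uint16_t));      // bits -> fp16
            const float x = GGML_CPU_FP16_TO_FP32(h); // widen to f32
            ggml_table_gelu_f16[t] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(x));
        }
    }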
@@ -613,9 +652,9 @@ inline static float ggml_gelu_quick_f32(float x) {
 inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
     }
 }
 #else
@@ -628,8 +667,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *

 inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
     }
 }

@@ -638,8 +677,8 @@ inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
 inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
-    float v = GGML_FP16_TO_FP32(x);
-    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+    float v = GGML_CPU_FP16_TO_FP32(x);
+    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
 }

 #if __FINITE_MATH_ONLY__
@@ -888,9 +927,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
 }

 inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
-    const float v = GGML_FP16_TO_FP32(x);
+    const float v = GGML_CPU_FP16_TO_FP32(x);
     const float s = 1.0f/(1.0f + expf(-v));
-    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
 }

 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
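The backward helper encodes the closed-form derivative of SiLU. With silu(x) = x * sigma(x), the product rule gives

    \frac{d}{dx}\,\mathrm{silu}(x) = \sigma(x)\,\bigl(1 + x\,(1 - \sigma(x))\bigr), \qquad \sigma(x) = \frac{1}{1+e^{-x}}

which is exactly the s*(1.0f + v*(1.0f - s)) factor that ggml_silu_backward_f16 multiplies into the incoming gradient dy.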
@@ -905,6 +944,100 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }

+inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
+    }
+}
+
+inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i] * g[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+        }
+    }
+}
+#else
+inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+    }
+}
+
+void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
+
+inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+    }
+}
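Note that ggml_vec_swiglu_f32 is only declared here; its definition lives out of line, which is the natural home for a vectorized implementation. A scalar reference consistent with the f16 loop above would be (a sketch, not the diff's actual definition):

    // Scalar reference for the declared ggml_vec_swiglu_f32: silu(x) gated by g.
    void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) {
        for (int i = 0; i < n; ++i) {
            y[i] = (x[i] / (1.0f + expf(-x[i]))) * g[i];
        }
    }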
+inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = x[i];
+        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
+    }
+}
+
+inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
+    }
+}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
+    }
+}
+#else
+inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
+    }
+}
+#endif
+
+inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
@@ -928,7 +1061,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float
 inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
     float sum = 0.0f;
     for (int i = 0; i < n; ++i) {
-        sum += GGML_FP16_TO_FP32(x[i]);
+        sum += GGML_CPU_FP16_TO_FP32(x[i]);
     }
     *s = sum;
 }
@@ -19,10 +19,10 @@
 #endif
 #include "ggml-common.h"

-#include <cstdio>
 #include <array>
 #include <cassert>
 #include <cfloat>
+#include <cstdio>
 #include <string>
 #include <vector>

@@ -76,11 +76,9 @@
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
-#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
-
-#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
-#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
-#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
+#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD

 #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
@@ -177,6 +175,23 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif

+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+#    define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes)                                                   \
+    do {                                                                                                   \
+        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false };                         \
+        const int id = ggml_cuda_get_device();                                                             \
+        if (!shared_memory_limit_raised[id]) {                                                             \
+            CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
+            shared_memory_limit_raised[id] = true;                                                         \
+        }                                                                                                  \
+    } while (0)
+#else
+#    define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
+    do {                                                 \
+        GGML_UNUSED(nbytes);                             \
+    } while (0)
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
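The new macro centralizes a pattern that previously appeared inline at every kernel needing more dynamic shared memory than the default cap: raise cudaFuncAttributeMaxDynamicSharedMemorySize once per device, remembering which devices were already handled. The call sites later in this diff use it like so:

    // From the cross-entropy change below: raise the cap once, then launch
    // with the large dynamic shared-memory size.
    CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32<true>), smpbo);
    cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(
        src0_d, src1_d, dst_tmp.ptr, ne00, nrows);

Note the extra parentheses around the template instantiation, which keep its comma inside a single macro argument.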
 #if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
 #else
@@ -203,13 +218,13 @@ typedef float2 dfloat2;
 #define FAST_FP16_AVAILABLE
 #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
 #define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)

-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))
 #define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -219,9 +234,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)

 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
@@ -233,7 +248,8 @@ static bool fast_fp16_available(const int cc) {

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) ||
+           (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }

 // Any FP16 tensor core instructions are available for ggml code.
@@ -241,15 +257,35 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+    if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) ||
+        GGML_CUDA_CC_IS_MTHREADS(cc)) {
+        return true;
+    } else if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+#if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+        return true;
+#else
+        return false;
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12)
+    } else {
+        return false;
+    }
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
-        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) ||
+        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
 }

 static bool bf16_mma_hardware_available(const int cc) {
     return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
 }

 static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
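These predicates fold the Moore Threads (MTHREADS/QY2) cases into the same capability checks already used for NVIDIA and AMD, so call sites no longer carry per-vendor conditions. A hypothetical dispatch sketch (illustrative; the real cuBLAS call site updated later in this diff simply tests fast_fp16_hardware_available(cc)):

    const int cc = ggml_cuda_info().devices[id].cc;
    if (fp16_mma_hardware_available(cc)) {
        // tensor-core / MMA path
    } else if (fast_fp16_hardware_available(cc)) {
        // plain fp16 arithmetic path
    } else {
        // fp32 fallback
    }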
@@ -262,11 +298,11 @@ static bool cp_async_available(const int cc) {
 }

 static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
-#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-    return __AMDGCN_WAVEFRONT_SIZE;
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
+    return 64;
 #else
     return 32;
-#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
 }

 [[noreturn]]
@@ -362,6 +398,26 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }

+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
+template<bool norm>
+static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += blockDim.x) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum);
+
+    if (col != 0) {
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
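As written, reduce_rows_f32 combines partial sums only with a warp-level reduction before thread 0 writes the row result, so a correct launch uses exactly one warp per row. A launch sketch under that assumption (variable names illustrative):

    // One block per row, one warp per block; norm=true computes the mean.
    const dim3 grid (nrows, 1, 1);
    const dim3 block(WARP_SIZE, 1, 1);
    reduce_rows_f32<false><<<grid, block, 0, stream>>>(x, dst, ncols); // row sums
    reduce_rows_f32<true><<<grid, block, 0, stream>>>(x, dst, ncols);  // row means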
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
@@ -767,21 +823,7 @@ struct ggml_backend_cuda_context {
         name(GGML_CUDA_NAME + std::to_string(device)) {
     }

-    ~ggml_backend_cuda_context() {
-        if (copy_event != nullptr) {
-            CUDA_CHECK(cudaEventDestroy(copy_event));
-        }
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
-            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
-                if (streams[i][j] != nullptr) {
-                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
-                }
-            }
-            if (cublas_handles[i] != nullptr) {
-                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
-            }
-        }
-    }
+    ~ggml_backend_cuda_context();

     cudaStream_t stream(int device, int stream) {
         if (streams[device][stream] == nullptr) {
ggml/src/ggml-cuda/conv2d-dw.cu (new file, 161 lines)
@@ -0,0 +1,161 @@
#include "conv2d-dw.cuh"
|
||||
|
||||
struct conv_params {
|
||||
int in_w, in_h;
|
||||
int out_w, out_h;
|
||||
int kernel_w, kernel_h;
|
||||
int stride_x, stride_y;
|
||||
int padding_x, padding_y;
|
||||
int dilation_x, dilation_y;
|
||||
int channels, batches;
|
||||
};
|
||||
|
||||
struct kernel_bounds {
|
||||
int y_min, y_max;
|
||||
int x_min, x_max;
|
||||
};
|
||||
|
||||
__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
|
||||
kernel_bounds bounds;
|
||||
bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
|
||||
bounds.y_max =
|
||||
min(params.kernel_h,
|
||||
(params.in_h + params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
|
||||
bounds.x_min = max(0, (params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x);
|
||||
bounds.x_max =
|
||||
min(params.kernel_w,
|
||||
(params.in_w + params.padding_x - out_x * params.stride_x + params.dilation_x - 1) / params.dilation_x);
|
||||
return bounds;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
|
||||
return out_coord * stride + kern_coord * dilation - padding;
|
||||
}
|
||||
|
||||
struct whcn_layout {
|
||||
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.in_w * params.in_h) + c * params.in_w * params.in_h + y * params.in_w + x;
|
||||
}
|
||||
|
||||
__device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
|
||||
return c * params.kernel_h * params.kernel_w + ky * params.kernel_w + kx;
|
||||
}
|
||||
|
||||
__device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.out_w * params.out_h) + c * params.out_w * params.out_h +
|
||||
y * params.out_w + x;
|
||||
}
|
||||
|
||||
__device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
|
||||
int & out_x) {
|
||||
out_x = global_idx % params.out_w;
|
||||
out_y = (global_idx / params.out_w) % params.out_h;
|
||||
c = (global_idx / (params.out_w * params.out_h)) % params.channels;
|
||||
n = global_idx / (params.out_w * params.out_h * params.channels);
|
||||
}
|
||||
};
|
||||
|
||||
struct cwhn_layout {
|
||||
__device__ static int input_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.in_w * params.in_h) + (y * params.in_w + x) * params.channels + c;
|
||||
}
|
||||
|
||||
__device__ static int kernel_index(int c, int ky, int kx, const conv_params & params) {
|
||||
return (ky * params.kernel_w + kx) * params.channels + c;
|
||||
}
|
||||
|
||||
__device__ static int output_index(int n, int c, int y, int x, const conv_params & params) {
|
||||
return n * (params.channels * params.out_w * params.out_h) + y * (params.out_w * params.channels) +
|
||||
x * params.channels + c;
|
||||
}
|
||||
|
||||
__device__ static void unpack_indices(int global_idx, const conv_params & params, int & n, int & c, int & out_y,
|
||||
int & out_x) {
|
||||
c = global_idx % params.channels;
|
||||
out_x = (global_idx / params.channels) % params.out_w;
|
||||
out_y = (global_idx / (params.channels * params.out_w)) % params.out_h;
|
||||
n = global_idx / (params.channels * params.out_w * params.out_h);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename Layout>
|
||||
__global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restrict__ kernel, T * __restrict__ output,
|
||||
const int in_w, const int in_h, const int out_w, const int out_h,
|
||||
const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
|
||||
const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
|
||||
const int channels, const int batches) {
|
||||
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int total_elements = batches * channels * out_h * out_w;
|
||||
|
||||
if (global_idx >= total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
conv_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x,
|
||||
stride_y, padding_x, padding_y, dilation_x, dilation_y, channels, batches };
|
||||
|
||||
int batch_idx, channel_idx, out_y_idx, out_x_idx;
|
||||
Layout::unpack_indices(global_idx, params, batch_idx, channel_idx, out_y_idx, out_x_idx);
|
||||
|
||||
T accumulator = 0;
|
||||
kernel_bounds bounds = calculate_kernel_bounds(out_x_idx, out_y_idx, params);
|
||||
|
||||
for (int kern_y = bounds.y_min; kern_y < bounds.y_max; ++kern_y) {
|
||||
int in_y_idx = calculate_input_coord(out_y_idx, kern_y, params.stride_y, params.dilation_y, params.padding_y);
|
||||
|
||||
for (int kern_x = bounds.x_min; kern_x < bounds.x_max; ++kern_x) {
|
||||
int in_x_idx = calculate_input_coord(out_x_idx, kern_x, params.stride_x, params.dilation_x, params.padding_x);
|
||||
|
||||
const T input_val = input[Layout::input_index(batch_idx, channel_idx, in_y_idx, in_x_idx, params)];
|
||||
const T kernel_val = kernel[Layout::kernel_index(channel_idx, kern_y, kern_x, params)];
|
||||
|
||||
accumulator += input_val * kernel_val;
|
||||
}
|
||||
}
|
||||
|
||||
output[Layout::output_index(batch_idx, channel_idx, out_y_idx, out_x_idx, params)] = accumulator;
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * kernel = dst->src[0];
|
||||
const ggml_tensor * input = dst->src[1];
|
||||
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
const float * w_d = (const float *) kernel->data;
|
||||
const float * x_d = (const float *) input->data;
|
||||
float * y_d = (float *) dst->data;
|
||||
|
||||
const int32_t * p = (const int32_t *) dst->op_params;
|
||||
const int stride_x = p[0];
|
||||
const int stride_y = p[1];
|
||||
const int padding_x = p[2];
|
||||
const int padding_y = p[3];
|
||||
const int dilation_x = p[4];
|
||||
const int dilation_y = p[5];
|
||||
|
||||
const int in_w = input->ne[0];
|
||||
const int in_h = input->ne[1];
|
||||
const int kernel_w = kernel->ne[0];
|
||||
const int kernel_h = kernel->ne[1];
|
||||
const int out_w = dst->ne[0];
|
||||
const int out_h = dst->ne[1];
|
||||
const int channels = dst->ne[2];
|
||||
const int batches = dst->ne[3];
|
||||
|
||||
cudaStream_t st = ctx.stream();
|
||||
|
||||
const int total = batches * channels * out_h * out_w;
|
||||
const int blocks = (total + CUDA_CONV2D_DW_BLOCK_SIZE - 1) / CUDA_CONV2D_DW_BLOCK_SIZE;
|
||||
|
||||
if (ggml_is_contiguous(input)) {
|
||||
conv2d_dw_kernel<float, whcn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
|
||||
x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
|
||||
dilation_x, dilation_y, channels, batches);
|
||||
} else if (ggml_is_contiguous_channels(input)) {
|
||||
conv2d_dw_kernel<float, cwhn_layout><<<blocks, CUDA_CONV2D_DW_BLOCK_SIZE, 0, st>>>(
|
||||
x_d, w_d, y_d, in_w, in_h, out_w, out_h, kernel_w, kernel_h, stride_x, stride_y, padding_x, padding_y,
|
||||
dilation_x, dilation_y, channels, batches);
|
||||
} else {
|
||||
GGML_ABORT("Unsupported memory layout for conv_2d_dw");
|
||||
}
|
||||
}
|
||||
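For reference, the host code reads out_w/out_h straight from dst->ne; for a depthwise convolution those extents are expected to satisfy the standard shape rule that calculate_kernel_bounds assumes:

    \text{out\_w} = \left\lfloor \frac{\text{in\_w} + 2\,\text{padding\_x} - \text{dilation\_x}\,(\text{kernel\_w} - 1) - 1}{\text{stride\_x}} \right\rfloor + 1

and symmetrically for out_h.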
ggml/src/ggml-cuda/conv2d-dw.cuh (new file, 5 lines)
@@ -0,0 +1,5 @@
+#pragma once
+#include "common.cuh"
+
+#define CUDA_CONV2D_DW_BLOCK_SIZE 256
+void ggml_cuda_op_conv2d_dw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml/src/ggml-cuda/conv2d-transpose.cu (new file, 91 lines)
@@ -0,0 +1,91 @@
+#include <algorithm>
+
+#include "conv2d-transpose.cuh"
+#include "ggml.h"
+
+__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
+                                        float * __restrict__ output, const int in_w, const int in_h, const int out_w,
+                                        const int out_h, const int kernel_w, const int kernel_h, const int stride,
+                                        const int c_in, const int c_out, const int batches) {
+    const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const int total_elements = out_w * out_h * c_out * batches;
+
+    if (global_idx >= total_elements) {
+        return;
+    }
+
+    const int out_x_idx = global_idx % out_w;
+    const int out_y_idx = (global_idx / out_w) % out_h;
+    const int c_idx     = (global_idx / (out_w * out_h)) % c_out;
+    const int n_idx     = global_idx / (out_w * out_h * c_out);
+
+    float accumulator = 0;
+    // For each output idx, find the inputs that contribute to it by checking stride alignment and bounds
+
+    for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            int in_y = out_y_idx - kh;
+            if (in_y < 0 || in_y % stride) continue;
+            in_y /= stride;
+            if (in_y >= in_h) continue;
+
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int in_x = out_x_idx - kw;
+                if (in_x < 0 || in_x % stride) continue;
+                in_x /= stride;
+                if (in_x >= in_w) continue;
+
+                const int input_idx  = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) * in_y + in_x;
+                const int kernel_idx =
+                    (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) * kh + kw;
+
+                float input_val = input[input_idx];
+                half  kern_val  = kernel[kernel_idx];
+
+                accumulator += input_val * (float) kern_val;
+            }
+        }
+    }
+
+    output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + (out_w) * out_y_idx + out_x_idx] = accumulator;
+}
+
+// input is (W, H, C_in, N), kernel is (W, H, C_out, C_in)
+void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * kernel = dst->src[0];
+    const ggml_tensor * input  = dst->src[1];
+
+    GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+
+    const float * input_data  = (const float *) input->data;
+    float *       output_data = (float *) dst->data;
+    const half *  kernel_data = (const half *) kernel->data;
+
+    const int input_w      = input->ne[0];
+    const int input_h      = input->ne[1];
+    const int output_w     = dst->ne[0];
+    const int output_h     = dst->ne[1];
+    const int channels_in  = input->ne[2];
+    const int channels_out = kernel->ne[2];
+    const int kernel_w     = kernel->ne[0];
+    const int kernel_h     = kernel->ne[1];
+    const int stride       = dst->op_params[0];
+    const int batches      = input->ne[3];
+
+    GGML_ASSERT(channels_in == kernel->ne[3]);
+    GGML_ASSERT(stride > 0);
+
+    cudaStream_t st = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(input));
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int total  = output_w * output_h * channels_out * batches;
+    const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;
+
+    conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
+        input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
+        channels_in, channels_out, batches);
+}
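The kernel supports stride only (no padding or dilation parameters), so the output extents implied by the gather loop are the usual transposed-convolution shape:

    \text{out\_w} = (\text{in\_w} - 1)\,\text{stride} + \text{kernel\_w}, \qquad \text{out\_h} = (\text{in\_h} - 1)\,\text{stride} + \text{kernel\_h}

Each output pixel gathers every input position whose upsampled footprint covers it, which is why the loop rejects offsets that are not stride-aligned (the in_y % stride and in_x % stride checks).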
ggml/src/ggml-cuda/conv2d-transpose.cuh (new file, 4 lines)
@@ -0,0 +1,4 @@
+#include "common.cuh"
+
+#define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
+void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -728,3 +728,25 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) {
             return nullptr;
     }
 }
+
+to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return convert_unary_cuda<float, nv_bfloat16>;
+        case GGML_TYPE_F16:
+            return convert_unary_cuda<half, nv_bfloat16>;
+        default:
+            return nullptr;
+    }
+}
+
+to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F16:
+            return convert_unary_cuda<half, float>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16, float>;
+        default:
+            return nullptr;
+    }
+}

@@ -22,5 +22,10 @@ using to_t_nc_cuda_t = void (*)(const void * x, T * y,
     int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
     int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream);

+typedef to_t_nc_cuda_t<float>       to_fp32_nc_cuda_t;
 typedef to_t_nc_cuda_t<half>        to_fp16_nc_cuda_t;
+typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;

+to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
 to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
+to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
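The "_nc" converters take explicit element strides s01..s03, so non-contiguous tensor views can be converted without first being copied to contiguous memory. A usage sketch (this call site is illustrative):

    // Convert a strided f16 view to bf16 ahead of a bf16 cuBLAS GEMM.
    to_bf16_nc_cuda_t convert = ggml_get_to_bf16_nc_cuda(GGML_TYPE_F16);
    if (convert != nullptr) {
        convert(src->data, dst_bf16, ne00, ne01, ne02, ne03, s01, s02, s03, stream);
    }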
@@ -123,13 +123,7 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
     ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);

     if (nbytes_shared <= smpbo) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
-        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
-        if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
-            shared_memory_limit_raised[id] = true;
-        }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+        CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32<true>), smpbo);
         cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
     } else {
         cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
@@ -175,13 +169,7 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten
     const size_t smpbo = ggml_cuda_info().devices[id].smpbo;

     if (nbytes_shared <= smpbo) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
-        static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
-        if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
-            shared_memory_limit_raised[id] = true;
-        }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
+        CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_back_f32<true>), smpbo);
         cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
     } else {
         cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
@@ -32,7 +32,9 @@ typedef void (* fattn_kernel_t)(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -851,7 +853,8 @@ void launch_fattn(
         scale, max_bias, m0, m1, n_head_log2, logit_softcap,
         Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
         K->ne[0], K->ne[1], K->ne[2], K->ne[3],
-        mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
+        mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0,
+        mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0,
         Q->nb[1], Q->nb[2], Q->nb[3],
         nb11, nb12, nb13,
         nb21, nb22, nb23,
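Threading ne32/nb32 through fattn_kernel_t (and every kernel below) gives the attention mask a third dimension: launch_fattn now forwards mask->ne[2] and mask->nb[2], and each kernel selects its mask slice with a broadcast-friendly modulo, e.g.:

    // Channel index wraps around ne32, so a mask with ne32 == 1 keeps the old
    // behavior of broadcasting a single mask across all heads.
    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);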
@@ -1223,7 +1223,9 @@ static __global__ void flash_attn_ext_f16(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -1288,7 +1290,8 @@ static __global__ void flash_attn_ext_f16(

     const float2 * Q_f2    = (const float2 *) (Q + nb02* channel*ncols2);
     const half2  * K_h2    = (const half2  *) (K + nb12*(channel*ncols2 / gqa_ratio));
-    const half2  * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
+    const half2  * mask_h2 = ncols2 == 1 && !mask ? nullptr :
+        (const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
     float2       * dstk    = ((float2 *) dst) + channel*(ncols2 * DV/2);

     const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
@@ -1327,7 +1330,8 @@ static __global__ void flash_attn_ext_f16(

     const float2 * Q_f2    = (const float2 *) (Q + nb02* channel*ncols2);
     const half2  * K_h2    = (const half2  *) (K + nb12*(channel*ncols2 / gqa_ratio));
-    const half2  * mask_h2 = ncols2 > 1 || mask ? (const half2 *) mask + (nb31/sizeof(half2))*jt*ncols1 : nullptr;
+    const half2  * mask_h2 = ncols2 == 1 && !mask ? nullptr :
+        (const half2 *) (mask + nb32*(channel % ne32) + nb31*jt*ncols1);
     float2       * dstk    = ((float2 *) dst) + channel*(ncols2 * DV/2);

     const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb22*(channel*ncols2 / gqa_ratio));
@@ -1348,8 +1352,8 @@ static __global__ void flash_attn_ext_f16(
     GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
     GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
-    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
     GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
     GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
     GGML_UNUSED(ne2); GGML_UNUSED(ne3);
@@ -6,7 +6,7 @@

 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(nwarps*WARP_SIZE, 1)
+__launch_bounds__(nwarps*WARP_SIZE, 2)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_tile_ext_f16(
     const char * __restrict__ Q,
@@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f16(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -64,7 +66,7 @@ static __global__ void flash_attn_tile_ext_f16(
     const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
     const half2  * K_h2  = (const half2  *) (K + nb12*(blockIdx.z / gqa_ratio));
     const half2  * V_h2  = (const half2  *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic0;
+    const half   * maskh = (const half   *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);

     const int stride_KV2 = nb11 / sizeof(half2);

@@ -288,8 +290,8 @@ static __global__ void flash_attn_tile_ext_f16(
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
     GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
     GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
     GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
     GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -6,7 +6,7 @@

 template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(nwarps*WARP_SIZE, 1)
+__launch_bounds__(nwarps*WARP_SIZE, 2)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_tile_ext_f32(
     const char * __restrict__ Q,
@@ -30,7 +30,9 @@ static __global__ void flash_attn_tile_ext_f32(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -58,8 +60,8 @@ static __global__ void flash_attn_tile_ext_f32(
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
     GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
     GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
     GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
     GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -76,7 +78,7 @@ static __global__ void flash_attn_tile_ext_f32(
     const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.z + nb01*ic0);
     const half2  * K_h2  = (const half2  *) (K + nb12*(blockIdx.z / gqa_ratio));
     const half2  * V_h2  = (const half2  *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic0;
+    const half   * maskh = (const half   *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);

     const int stride_KV2 = nb11 / sizeof(half2);

@@ -297,14 +299,14 @@ static __global__ void flash_attn_tile_ext_f32(
     GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
     GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
-    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
-    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
-    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
-    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
-    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
@@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f16(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -68,7 +70,7 @@ static __global__ void flash_attn_vec_ext_f16(
     K += nb12*(blockIdx.z / gqa_ratio);
     V += nb22*(blockIdx.z / gqa_ratio);

-    const half * maskh = (const half *) mask + ne11*ic0;
+    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);

     const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
     const half  slopeh = __float2half(slopef);
@@ -342,8 +344,8 @@ static __global__ void flash_attn_vec_ext_f16(
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
     GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
     GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
     GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
     GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -27,7 +27,9 @@ static __global__ void flash_attn_vec_ext_f32(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -51,8 +53,8 @@ static __global__ void flash_attn_vec_ext_f32(
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
     GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
     GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
     GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
     GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
@@ -79,7 +81,8 @@ static __global__ void flash_attn_vec_ext_f32(
     Q += nb02* blockIdx.z + nb01*ic0;
     K += nb12*(blockIdx.z / gqa_ratio);
     V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
-    const half * maskh = (const half *) mask + ne11*ic0;
+
+    const half * maskh = (const half *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);

     const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);

@@ -334,13 +337,15 @@ static __global__ void flash_attn_vec_ext_f32(
     GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
     GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
     GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
-    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
-    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
-    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
-    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }
@@ -9,7 +9,11 @@
 #ifdef FP16_MMA_AVAILABLE
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 #include <mma.h>
+#ifdef GGML_USE_MUSA
+namespace wmma = mtmusa::wmma;
+#else // GGML_USE_MUSA
 namespace wmma = nvcuda::wmma;
+#endif // GGML_USE_MUSA
 #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)
 #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers
 #include <rocwmma/rocwmma.hpp>
@@ -42,7 +46,9 @@ static __global__ void flash_attn_ext_f16(
     const int ne12,
     const int ne13,
     const int ne31,
+    const int ne32,
     const int nb31,
+    const int nb32,
     const int nb01,
     const int nb02,
     const int nb03,
@@ -90,11 +96,11 @@ static __global__ void flash_attn_ext_f16(
     constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);

     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f   = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
-    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.z / gqa_ratio));
-    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
-    const half  * maskh = (const half  *)  mask + (nb31/sizeof(half))* ic0;
-    const half2 * mask2 = (const half2 *)  mask + (nb31/sizeof(half))*(ic0/2);
+    const float * Q_f   = (const float *) (Q + nb02* blockIdx.z + nb01*ic0);
+    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.z / gqa_ratio));
+    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
+    const half  * maskh = (const half  *) (mask + nb32*(blockIdx.z % ne32) + nb31*ic0);
+    const half2 * mask2 = (const half2 *) maskh;

     const int stride_Q  = nb01 / sizeof(float);
     const int stride_KV = nb11 / sizeof(half);
@@ -436,7 +442,7 @@ static __global__ void flash_attn_ext_f16(
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
     GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
     GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
     GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
     GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
@@ -168,6 +168,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
             get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
                 ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
+        case GGML_TYPE_I32:
+            get_rows_cuda_float((const int32_t *) src0_d, src1_d, dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
         case GGML_TYPE_BF16:
             get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
                 ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
@@ -210,6 +214,10 @@ void get_rows_cuda(
             ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
                 ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
             break;
+        case GGML_TYPE_I32:
+            ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (int32_t *) dst_d,
+                ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
+            break;
         case GGML_TYPE_F16:
             ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
                 ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
@@ -11,6 +11,8 @@
 #include "ggml-cuda/clamp.cuh"
 #include "ggml-cuda/concat.cuh"
 #include "ggml-cuda/conv-transpose-1d.cuh"
+#include "ggml-cuda/conv2d-dw.cuh"
+#include "ggml-cuda/conv2d-transpose.cuh"
 #include "ggml-cuda/convert.cuh"
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
@@ -35,6 +37,7 @@
 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
@@ -47,6 +50,7 @@
 #include <atomic>
 #include <charconv>
 #include <cinttypes>
+#include <condition_variable>
 #include <cstddef>
 #include <cstdint>
 #include <float.h>
@@ -54,9 +58,8 @@
 #include <map>
 #include <memory>
 #include <mutex>
-#include <stdint.h>
-#include <stdio.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string>
 #include <vector>
@@ -97,8 +100,7 @@ int ggml_cuda_get_device() {
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
     cudaError_t err;
-    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
-    {
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
         err = cudaMallocManaged(ptr, size);
 #if defined(GGML_USE_HIP)
         if (err == hipSuccess) {
@@ -116,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
             err = cudaMalloc(ptr, size);
         }
 #endif // defined(GGML_USE_HIP)
-    }
-    else
-    {
+    } else {
         err = cudaMalloc(ptr, size);
     }
     return err;
@@ -514,6 +514,33 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

+// destroying a cuBLAS handle while a graph is being captured in a different thread can result in a CUDA error
+// this lock is used to ensure that no cuBLAS handle is destroyed while a graph is being captured
+
+static std::mutex ggml_cuda_lock;
+static std::condition_variable ggml_cuda_lock_cv;
+static std::atomic<int> ggml_cuda_lock_counter;
+
+ggml_backend_cuda_context::~ggml_backend_cuda_context() {
+    std::unique_lock<std::mutex> lock(ggml_cuda_lock);
+    ggml_cuda_lock_cv.wait(lock, []{ return ggml_cuda_lock_counter.load(std::memory_order_relaxed) == 0; });
+
+    if (copy_event != nullptr) {
+        CUDA_CHECK(cudaEventDestroy(copy_event));
+    }
+    for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
+        for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
+            if (streams[i][j] != nullptr) {
+                CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
+            }
+        }
+        if (cublas_handles[i] != nullptr) {
+            CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
+        }
+    }
+}
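The destructor (previously inline in the header, moved out of line above) now blocks until no graph capture is in flight. The matching acquire/release on the capture side is outside this excerpt; a minimal sketch of the expected protocol (illustrative, not the diff's code):

    // Bracket a capture so no cuBLAS handle is destroyed mid-capture.
    ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
    // ... cudaStreamBeginCapture / build + launch work / cudaStreamEndCapture ...
    ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed);
    ggml_cuda_lock_cv.notify_all();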

 // cuda buffer

 struct ggml_backend_cuda_buffer_context {
@@ -1200,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas(

     const int cc = ggml_cuda_info().devices[id].cc;

+    const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
+                               (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);
+
     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;

-    if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+    if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
         if (src1->type != GGML_TYPE_BF16) {
             const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);
@@ -1230,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas(

         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
         to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
+    } else if (fast_fp16_hardware_available(cc) && use_fp16) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
@@ -1719,7 +1749,7 @@ static void ggml_cuda_op_mul_mat(
 }

 static __global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, char * dst,
+        const void * src0_as_f16, const void * src1_as_f16, char * dst,
         const void ** ptrs_src, void ** ptrs_dst,
         int64_t ne12, int64_t ne13,
         int64_t ne23,
@@ -1742,83 +1772,131 @@ static __global__ void k_compute_batched_ptrs(
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
// Type traits for mapping ggml types to CUDA/cuBLAS types
|
||||
template<ggml_type T>
|
||||
struct batched_mul_mat_traits;
|
||||
|
||||
template<>
|
||||
struct batched_mul_mat_traits<GGML_TYPE_F32> {
|
||||
using cuda_type = float;
|
||||
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
|
||||
static inline const cudaDataType_t data_type = CUDA_R_32F;
|
||||
static inline const ggml_type ggml_type_val = GGML_TYPE_F32;
|
||||
static inline const float alpha = 1.0f;
|
||||
static inline const float beta = 0.0f;
|
||||
static inline const void* get_alpha() { static const float val = alpha; return &val; }
|
||||
static inline const void* get_beta() { static const float val = beta; return &val; }
|
||||
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct batched_mul_mat_traits<GGML_TYPE_BF16> {
|
||||
using cuda_type = nv_bfloat16;
|
||||
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
|
||||
static inline const cudaDataType_t data_type = CUDA_R_16BF;
|
||||
static inline const ggml_type ggml_type_val = GGML_TYPE_BF16;
|
||||
static inline const float alpha = 1.0f;
|
||||
static inline const float beta = 0.0f;
|
||||
static inline const void* get_alpha() { static const float val = alpha; return &val; }
|
||||
static inline const void* get_beta() { static const float val = beta; return &val; }
|
||||
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct batched_mul_mat_traits<GGML_TYPE_F16> {
|
||||
using cuda_type = half;
|
||||
static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F;
|
||||
static inline const cudaDataType_t data_type = CUDA_R_16F;
|
||||
static inline const ggml_type ggml_type_val = GGML_TYPE_F16;
|
||||
static inline const half alpha = 1.0;
|
||||
static inline const half beta = 0.0;
|
||||
static inline const void* get_alpha() { static const half val = alpha; return &val; }
|
||||
static inline const void* get_beta() { static const half val = beta; return &val; }
|
||||
static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); }
|
||||
};
|
||||
|
||||
template<ggml_type src0_type>
|
||||
static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
using traits = batched_mul_mat_traits<src0_type>;
|
||||
using cuda_t = typename traits::cuda_type;
|
||||
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft));
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src0->type == src0_type);
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
// Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
|
||||
// As long as dst is contiguous this does not matter though.
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int64_t ne_dst = ggml_nelements(dst);
|
||||
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
|
||||
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
|
||||
|
||||
const half * src0_f16 = (const half *) src0->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
|
||||
const half * src1_f16 = (const half *) src1->data;
|
||||
const size_t ts_src1 = ggml_type_size(src1->type);
|
||||
GGML_ASSERT(nb10 == ts_src1);
|
||||
int64_t s11 = nb11 / ts_src1;
|
||||
int64_t s12 = nb12 / ts_src1;
|
||||
int64_t s13 = nb13 / ts_src1;
|
||||
ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
|
||||
|
||||
// convert src1 to fp16
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
|
||||
const cuda_t * src0_ptr = nullptr;
|
||||
const cuda_t * src1_ptr = nullptr;
|
||||
|
||||
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
|
||||
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
|
||||
|
||||
// Handle src0
|
||||
src0_ptr = (const cuda_t *) src0->data;
|
||||
|
||||
// Handle src1 - convert if necessary
|
||||
if (src1->type == src0_type) {
|
||||
src1_ptr = (const cuda_t *) src1->data;
|
||||
} else {
|
||||
// Convert src1 to target type using traits conversion functions
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
src1_alloc.alloc(ne_src1);
|
||||
|
||||
to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
|
||||
|
||||
src1_f16 = src1_f16_alloc.get();
|
||||
const auto convert_func = traits::get_nc_converter(src1->type);
|
||||
GGML_ASSERT(convert_func != nullptr);
|
||||
convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
|
||||
src1_ptr = src1_alloc.get();
|
||||
s11 = ne10;
|
||||
s12 = ne11*s11;
|
||||
s13 = ne12*s12;
|
||||
}
|
||||
|
||||
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
|
||||
// Setup destination buffer
|
||||
ggml_cuda_pool_alloc<cuda_t> dst_temp(ctx.pool());
|
||||
char * dst_t;
|
||||
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
cudaDataType_t cu_data_type = CUDA_R_16F;
|
||||
|
||||
// dst strides
|
||||
size_t nbd2 = dst->nb[2];
|
||||
size_t nbd3 = dst->nb[3];
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
cublasComputeType_t cu_compute_type = traits::compute_type;
|
||||
cudaDataType_t cu_data_type = traits::data_type;
|
||||
cudaDataType_t cu_data_type_a = traits::data_type;
|
||||
cudaDataType_t cu_data_type_b = traits::data_type;
|
||||
const void * alpha = traits::get_alpha();
|
||||
const void * beta = traits::get_beta();
|
||||
const float alpha_f32 = 1.0f;
|
||||
const float beta_f32 = 0.0f;
|
||||
|
||||
const void * alpha = &alpha_f16;
|
||||
const void * beta = &beta_f16;
|
||||
const float beta_f32 = 0.0f;
|
||||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
dst_t = (char *) dst_f16.alloc(ne_dst);
|
||||
|
||||
nbd2 /= sizeof(float) / sizeof(half);
|
||||
nbd3 /= sizeof(float) / sizeof(half);
|
||||
if constexpr (src0_type == GGML_TYPE_F32) {
|
||||
dst_t = (char *) dst_ddf; // Direct F32 output
|
||||
} else {
|
||||
dst_t = (char *) dst_temp.alloc(ne_dst);
|
||||
nbd2 /= sizeof(float) / sizeof(cuda_t);
|
||||
nbd3 /= sizeof(float) / sizeof(cuda_t);
|
||||
}
|
||||
} else {
|
||||
dst_t = (char *) dst_ddf;
|
||||
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
cu_data_type = CUDA_R_32F;
|
||||
|
||||
cu_data_type = CUDA_R_32F;
|
||||
alpha = &alpha_f32;
|
||||
beta = &beta_f32;
|
||||
beta = &beta_f32;
|
||||
}
|
||||
|
||||
int id = ggml_cuda_get_device();
|
||||
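
The three specializations above turn what used to be per-type runtime branching into a compile-time lookup table. An illustrative (not part of the patch) use of the traits, showing what gets baked in at instantiation time:

    // Illustrative only: each instantiation fixes the cuBLAS enums, the
    // device element type, and the matching non-contiguous converter.
    template<ggml_type T>
    static void print_gemm_config() {
        using traits = batched_mul_mat_traits<T>;
        using cuda_t = typename traits::cuda_type; // float, nv_bfloat16, or half
        printf("compute_type=%d data_type=%d elem_size=%zu\n",
               (int) traits::compute_type, (int) traits::data_type, sizeof(cuda_t));
    }
    // print_gemm_config<GGML_TYPE_F16>(); // -> CUBLAS_COMPUTE_16F, CUDA_R_16F, 2
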
@@ -1826,7 +1904,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
         cu_compute_type = CUBLAS_COMPUTE_32F;
         alpha = &alpha_f32;
-        beta = &beta_f32;
+        beta  = &beta_f32;
     }
 
     GGML_ASSERT(ne12 % ne02 == 0);
@@ -1836,35 +1914,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-#if 0
-    // use cublasGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                CUBLAS_CHECK(
-                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
-                    ne01, ne11, ne10,
-                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
-                           src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
-                    beta,  (      char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
-                    cu_compute_type,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
     if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00, // strideA
-                       src1_f16, CUDA_R_16F,   s11,       s12,       // strideB
-                beta,     dst_t, cu_data_type, ne0,       ne1*ne0,   // strideC
+                alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
+                       src1_ptr, cu_data_type_b, s11,       s12,       // strideB
+                beta,     dst_t, cu_data_type,   ne0,       ne1*ne0,   // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
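
For readers unfamiliar with the call above: one cublasGemmStridedBatchedEx launch multiplies batchCount independent matrix pairs, each offset from the previous by a fixed element stride. A self-contained F32 example with the same argument layout (sizes arbitrary; CUBLAS_CHECK is this file's error-checking macro):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    // C_i = A_i^T * B_i for i in [0, batch). A is stored k x m per batch
    // (hence CUBLAS_OP_T), B is k x n, C comes out m x n, all column-major.
    void gemm_batched_f32(cublasHandle_t handle,
                          const float * A, const float * B, float * C,
                          int m, int n, int k, int batch) {
        const float alpha = 1.0f, beta = 0.0f;
        CUBLAS_CHECK(cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                m, n, k,
                &alpha, A, CUDA_R_32F, k, (long long) k*m,   // lda, strideA
                        B, CUDA_R_32F, k, (long long) k*n,   // ldb, strideB
                &beta,  C, CUDA_R_32F, m, (long long) m*n,   // ldc, strideC
                batch,
                CUBLAS_COMPUTE_32F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    }

In the patch, strideA/strideB/strideC come from the tensors' nb/ne metadata instead of dense packing, which is why the fast path requires src0 and src1 to be contiguous across dims 2 and 3.
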
@@ -1875,34 +1933,55 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
 
+        size_t src1_stride_size = sizeof(cuda_t);
+
         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_f16, src1_f16, dst_t,
+                src0_ptr, src1_ptr, dst_t,
                 ptrs_src.get(), ptrs_dst.get(),
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
-                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
+                (src1->type == src0_type) ? nb12 : s12*src1_stride_size,
+                (src1->type == src0_type) ? nb13 : s13*src1_stride_size,
                 nbd2, nbd3,
                 r2, r3);
 
         CUDA_CHECK(cudaGetLastError());
 
         CUBLAS_CHECK(
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F,   nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F,   s11,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00,
+                       (const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type,   ne0,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     }
-#endif
 
-    if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
+    // Convert output back to F32 if needed
+    if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val);
+        to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream);
     }
 }
 
+static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32);
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F32>(ctx, src0, src1, dst);
+            break;
+        case GGML_TYPE_BF16:
+            ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_BF16>(ctx, src0, src1, dst);
+            break;
+        case GGML_TYPE_F16:
+            ggml_cuda_mul_mat_batched_cublas_impl<GGML_TYPE_F16>(ctx, src0, src1, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type");
+    }
+}

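Design note on the wrapper: the only decision left at run time is this three-way switch; everything type-specific (element type, cuBLAS enums, scaling constants, converter) was already fixed when each _impl specialization was instantiated. Supporting a fourth source type would mean one more traits specialization and one more case label, with no edits inside the implementation body.
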
@@ -1916,16 +1995,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
 
     bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
     bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
-    bool any_gpus_with_slow_fp16   = false;
-    bool any_gpus_without_fp16_mma = false;
+    bool any_gpus_with_slow_fp16 = false;
 
     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1936,16 +2013,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }
 
-            const int cc              = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+            const int cc            = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
-        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
 
     // debug helpers
@@ -1956,7 +2033,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    //TODO update for generic tensor parallelism
+    const int cc                 = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    bool use_batched_cublas_f16  = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16);
+    bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
+    bool use_batched_cublas_f32  = src0->type == GGML_TYPE_F32;
+
+    if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
@@ -1964,8 +2047,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && use_mul_mat_q) {
         ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst);
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32)
+        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
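
Taken together, the dispatch order in ggml_cuda_mul_mat is now: the custom floating-point vector kernel (MMV), then the quantized vector kernel (MMVQ), then quantized matrix multiplication (MMQ), then batched cuBLAS for F16/BF16/F32 multi-batch shapes, and finally the generic per-matrix cuBLAS path. The src0->ne[1] <= MMV_MAX_ROWS and fp16-MMA heuristics that used to sit at this call site appear to have moved into ggml_cuda_should_use_mmv(), which now gates use_mul_mat_vec per device.
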
@@ -2220,6 +2303,27 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
                     return false;
             }
             break;
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(dst)) {
+                case GGML_GLU_OP_REGLU:
+                    ggml_cuda_op_reglu(ctx, dst);
+                    break;
+                case GGML_GLU_OP_GEGLU:
+                    ggml_cuda_op_geglu(ctx, dst);
+                    break;
+                case GGML_GLU_OP_SWIGLU:
+                    ggml_cuda_op_swiglu(ctx, dst);
+                    break;
+                case GGML_GLU_OP_GEGLU_ERF:
+                    ggml_cuda_op_geglu_erf(ctx, dst);
+                    break;
+                case GGML_GLU_OP_GEGLU_QUICK:
+                    ggml_cuda_op_geglu_quick(ctx, dst);
+                    break;
+                default:
+                    return false;
+            }
+            break;
         case GGML_OP_NORM:
             ggml_cuda_op_norm(ctx, dst);
             break;
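
The five GLU variants share one shape: split the input into a gate half and a value half, apply an activation to the gate, multiply elementwise. A hedged sketch of the general recipe as a single kernel (this is not the patch's ggml_cuda_op_* code, which also handles non-contiguous splits and a swapped-operand flag):

    // Sketch: fused SwiGLU over two contiguous halves: dst[i] = silu(a[i]) * b[i].
    // The other variants substitute relu / gelu / erf-gelu / quick-gelu for silu.
    static __global__ void k_swiglu(const float * a, const float * b, float * dst, int64_t n) {
        const int64_t i = blockIdx.x*(int64_t) blockDim.x + threadIdx.x;
        if (i >= n) {
            return;
        }
        const float x    = a[i];
        const float silu = x / (1.0f + expf(-x)); // x * sigmoid(x)
        dst[i] = silu * b[i];
    }
    // launch: k_swiglu<<<(n + 255)/256, 256, 0, stream>>>(a, b, dst, n);
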
@@ -2310,6 +2414,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
+        case GGML_OP_CONV_2D_DW:
+            ggml_cuda_op_conv2d_dw(ctx, dst);
+            break;
+        case GGML_OP_CONV_TRANSPOSE_2D:
+            ggml_cuda_conv_2d_transpose_p0(ctx, dst);
+            break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             ggml_cuda_op_conv_transpose_1d(ctx,dst);
             break;
@@ -2322,6 +2432,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
+        case GGML_OP_MEAN:
+            ggml_cuda_op_mean(ctx, dst);
+            break;
         case GGML_OP_SSM_CONV:
             ggml_cuda_op_ssm_conv(ctx, dst);
             break;
@@ -2664,7 +2777,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft)));
             }
         }
-#endif
+#else
+        GGML_UNUSED(integrated);
+#endif // NDEBUG
 
         bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
         if (!ok) {
@@ -2683,6 +2798,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
         CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
         graph_evaluated_or_captured = true; // CUDA graph has been captured
+
+        std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+        if (ggml_cuda_lock_counter.fetch_sub(1, std::memory_order_relaxed) == 1) {
+            ggml_cuda_lock_cv.notify_all();
+        }
     } else {
         graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
     }
@@ -2758,7 +2878,13 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         }
     }
 
-    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+    if (use_cuda_graph && cuda_graph_update_required) {
+        // Start CUDA graph capture
+        {
+            std::lock_guard<std::mutex> lock(ggml_cuda_lock);
+            ggml_cuda_lock_counter.fetch_add(1, std::memory_order_relaxed);
+        }
+
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }

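Capture is only half the machinery: the captured graph is instantiated once and re-launched on later, topologically identical evaluations. The bare CUDA-runtime skeleton, without ggml's update/reuse logic or error handling (the five-argument cudaGraphInstantiate shown is the CUDA 11 signature; CUDA 12 drops the error-node and log parameters):

    #include <cuda_runtime.h>

    // Minimal capture -> instantiate -> replay skeleton.
    void run_with_graph(cudaStream_t stream) {
        cudaGraph_t     graph      = nullptr;
        cudaGraphExec_t graph_exec = nullptr;

        cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
        // ... enqueue the same kernel/memcpy sequence as a normal evaluation ...
        cudaStreamEndCapture(stream, &graph);

        cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);
        cudaGraphLaunch(graph_exec, stream); // replay; repeat while the topology is unchanged
        cudaStreamSynchronize(stream);

        cudaGraphExecDestroy(graph_exec);
        cudaGraphDestroy(graph);
    }
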
@@ -2991,6 +3117,18 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     return false;
             }
             break;
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(op)) {
+                case GGML_GLU_OP_REGLU:
+                case GGML_GLU_OP_GEGLU:
+                case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_GEGLU_ERF:
+                case GGML_GLU_OP_GEGLU_QUICK:
+                    return ggml_is_contiguous_1(op->src[0]);
+                default:
+                    return false;
+            }
+            break;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
@@ -3014,9 +3152,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     return false;
                 }
 #ifdef GGML_USE_MUSA
-                if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
-                    !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
-                    return false;
+                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+                if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+                    if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT &&
+                        a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) {
+                        return false;
+                    }
+                    if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID &&
+                        a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) {
+                        return false;
+                    }
                 }
 #endif // GGML_USE_MUSA
                 switch (a->type) {
@@ -3043,11 +3188,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
                     case GGML_TYPE_BF16:
-#ifdef GGML_USE_MUSA
-                        if (a->type == GGML_TYPE_Q3_K) {
-                            return false;
-                        }
-#endif // GGML_USE_MUSA
                         return true;
                     default:
                         return false;
@@ -3060,6 +3200,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             switch (op->src[0]->type) {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
+                case GGML_TYPE_BF16:
+                case GGML_TYPE_I32:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q5_0:
@@ -3189,12 +3331,26 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_COS:
        case GGML_OP_CLAMP:
         case GGML_OP_LOG:
-        case GGML_OP_SSM_SCAN:
-        case GGML_OP_SSM_CONV:
             return true;
+        case GGML_OP_SSM_SCAN: {
+            if (op->src[3]->ne[0] == 1) {
+                // Mamba2
+                // (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
+                return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
+            } else {
+                // Mamba
+                // (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
+                return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1;
+            }
+        }
+        case GGML_OP_SSM_CONV: {
+            // assumes d_inner % threads == 0
+            return op->src[0]->ne[1] % 128 == 0;
+        }
         case GGML_OP_CONT:
             return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
             return true;
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_SOFT_MAX_BACK: {
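
Note on the SSM_SCAN split: src[3] is the A tensor. Mamba-2 collapses A to a single scalar per head (so ne[0] == 1), while the original Mamba keeps a full d_state-wide A, and that one dimension is enough to route each model to the kernel variant whose shape constraints the comments spell out (src[0] is the state tensor, with ne[0] = d_state, ne[1] = d_head, ne[2] = n_head).
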
@@ -3207,16 +3363,18 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
         case GGML_OP_IM2COL:
+        case GGML_OP_CONV_2D_DW:
+        case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
             return true;
         case GGML_OP_GROUP_NORM:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_UPSCALE:
             return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
@@ -3240,6 +3398,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             if (op->src[0]->ne[0] == 192) {
                 return false;
             }
+            // TODO: support broadcast
+            // note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
+            // the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
+            if (op->src[0]->ne[3] != 1) {
+                return false;
+            }
ggml/src/ggml-cuda/mean.cu  (new file, 19 additions)
|
||||
#include "mean.cuh"
|
||||
|
||||
void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const int64_t ncols = src0->ne[0];
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
|
||||
}
|
||||
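
reduce_rows_f32 itself is defined elsewhere (it is shared with GGML_OP_SUM_ROWS; the boolean template parameter selects whether the row sum is divided by the row length). A hedged sketch of a reduction with this launch shape, one WARP_SIZE-thread block per row, assuming a 32-lane warp:

    // Sketch: one warp per row; norm=true divides by ncols, turning the
    // row sum into a row mean (as in the launch above).
    template <bool norm>
    static __global__ void k_reduce_rows_f32(const float * x, float * dst, const int ncols) {
        const int row = blockIdx.x;
        const int col = threadIdx.x;

        float sum = 0.0f;
        for (int i = col; i < ncols; i += blockDim.x) {
            sum += x[row*ncols + i];
        }
        // warp-level tree reduction across the 32 lanes
        for (int offset = 16; offset > 0; offset >>= 1) {
            sum += __shfl_xor_sync(0xffffffff, sum, offset);
        }
        if (col == 0) {
            dst[row] = norm ? sum / ncols : sum;
        }
    }
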
Some files were not shown because too many files have changed in this diff.