Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-04-30 16:47:31 +03:00.

Compare commits: b4281...gg/compare (49 commits)
Commits: 7e9208e408, 89d604f2c8, e52aba537a, ba1cb19cdd, 56eea0781c, a76c56fa1a, c27ac678dd, 11e07fd63b, 4601a8bb67, 9f35e44592, 64ae065511, 83ed24a97b, d583cd03f6, adffa6ffd5, 274ec65af6, 8faa1d4dd4, cb13ef85a4, 4064c0e3b6, dc5301d565, 9fdb124304, 5555c0c1f6, 973f328b1e, fb18934a97, 235f6e14bf, 1a31d0dc00, 92f77a640f, 484d2f31ae, 4b4d92b098, 43041d2eb3, b685daf386, dafae66cc2, ae4b922614, 750cb3e246, a86ad841f1, a05e2afcc2, 26a8406ba9, c37fb4cf62, 3d98b4cb22, 1a05004743, ce8784bdb1, e52522b869, 06d70147e6, 43ed389a3f, ecc93d0558, 62e84d9848, 3573fa8e7b, d9c3ba2b77, ce4a7b8493, 19d8762ab6
```diff
@@ -31,6 +31,7 @@
   # Increases the runtime closure size by ~700M
   useMpi ? false,
   useRocm ? config.rocmSupport,
+  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
   enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
@@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     ]
     ++ optionals useRocm [
       (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
     ]
     ++ optionals useMetalKit [
       (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
```
```diff
@@ -8,11 +8,11 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    exec ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
```
.github/workflows/build.yml (vendored): 133 changes
```diff
@@ -552,35 +552,44 @@ jobs:
             -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
 
-  # TODO: tmp disabled. see for possible re-enable:
-  #       https://github.com/ggerganov/llama.cpp/pull/10525
-  # macOS-latest-swift:
-  #   runs-on: macos-latest
-  #
-  #   strategy:
-  #     matrix:
-  #       destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-  #
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Dependencies
-  #       id: depends
-  #       continue-on-error: true
-  #       run: |
-  #         brew update
-  #
-  #     - name: xcodebuild for swift package
-  #       id: xcodebuild
-  #       run: |
-  #         xcodebuild -scheme llama -destination "${{ matrix.destination }}"
-  #
-  #     - name: Build Swift Example
-  #       id: make_build_swift_example
-  #       run: |
-  #         make swift
+  macOS-latest-swift:
+    runs-on: macos-latest
+
+    strategy:
+      matrix:
+        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build llama.cpp with CMake
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+          sudo cmake --install . --config Release
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}"
 
   windows-msys2:
     runs-on: windows-latest
```
```diff
@@ -653,6 +662,8 @@ jobs:
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
 
     steps:
       - name: Clone
@@ -694,6 +705,28 @@
         run: |
           choco install ninja
 
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          mkdir build && cd build
+          cmake .. `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          mkdir build-arm64-release && cd build-arm64-release
+          cmake .. `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
@@ -723,7 +756,7 @@
       - name: Test
         id: cmake_test
         # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
          ctest -L main -C Release --verbose --timeout 900
```
```diff
@@ -1104,6 +1137,29 @@
       - name: Checkout code
         uses: actions/checkout@v4
 
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=iOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+          sudo cmake --install . --config Release
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama-Package -destination 'generic/platform=iOS'
+
       - name: Build Xcode project
         run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
 
@@ -1131,23 +1187,6 @@
 
           ./gradlew build --no-daemon
 
-  # freeBSD-latest:
-  #   runs-on: macos-12
-  #   steps:
-  #     - name: Clone
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Build
-  #       uses: cross-platform-actions/action@v0.19.0
-  #       with:
-  #         operating_system: freebsd
-  #         version: '13.2'
-  #         hypervisor: 'qemu'
-  #         run: |
-  #           sudo pkg update
-  #           sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
-  #           gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
 
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
```
.github/workflows/server.yml (vendored): 2 changes
```diff
@@ -79,7 +79,7 @@
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4
         with:
-          node-version: 22
+          node-version: '22.11.0'
 
       - name: Verify bundled index.html
         id: verify_server_index_html
```
```diff
@@ -46,11 +46,9 @@ if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()
 
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
-    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
+if (MSVC)
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>")
 endif()
 
 #
```
```diff
@@ -31,6 +31,13 @@
     { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
     { "name": "vulkan",   "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
 
+    {
+        "name": "x64-windows-llvm", "hidden": true,
+        "cacheVariables": {
+            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
+        }
+    },
+
     {
         "name": "arm64-windows-msvc", "hidden": true,
         "architecture": { "value": "arm64", "strategy": "external" },
@@ -70,6 +77,11 @@
     { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
     { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
 
+    { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
+    { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
+    { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
+    { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
+
     { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
     { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
```
```diff
@@ -1,3 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 
-ci/ @ggerganov
+/ci/ @ggerganov
+/.devops/ @ngxson
+/examples/server/ @ngxson
```
Makefile: 31 changes
```diff
@@ -22,6 +22,7 @@ BUILD_TARGETS = \
 	llama-infill \
 	llama-llava-cli \
 	llama-minicpmv-cli\
+	llama-qwen2vl-cli\
 	llama-lookahead \
 	llama-lookup \
 	llama-lookup-create \
@@ -445,6 +446,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 		MK_CFLAGS     += -march=native -mtune=native
 		HOST_CXXFLAGS += -march=native -mtune=native
 
+		# Usage AMX build test
+		#MK_CFLAGS   += -march=graniterapids -mtune=graniterapids
+		#HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids
+
 		# Usage AVX-only
 		#MK_CFLAGS   += -mfma -mf16c -mavx
 		#MK_CXXFLAGS += -mfma -mf16c -mavx
@@ -948,7 +953,6 @@ DIR_COMMON = common
 
 OBJ_GGML = \
 	$(DIR_GGML)/src/ggml.o \
-	$(DIR_GGML)/src/ggml-aarch64.o \
 	$(DIR_GGML)/src/ggml-alloc.o \
 	$(DIR_GGML)/src/ggml-backend.o \
 	$(DIR_GGML)/src/ggml-backend-reg.o \
@@ -956,9 +960,11 @@ OBJ_GGML = \
 	$(DIR_GGML)/src/ggml-quants.o \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
+	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
 	$(OBJ_GGML_EXT)
 
 OBJ_LLAMA = \
@@ -1098,17 +1104,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
 # Default target
 all: $(BUILD_TARGETS)
 
 # force c++ build for source file that have same name as c file
 # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
-$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
-	ggml/src/ggml-cpu/ggml-cpu.cpp \
-	ggml/include/ggml-backend.h \
-	ggml/include/ggml.h \
-	ggml/include/ggml-alloc.h \
-	ggml/src/ggml-backend-impl.h \
-	ggml/include/ggml-cpu.h \
-	ggml/src/ggml-impl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp
+	$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
 
 # Rules for building object files
 $(DIR_GGML)/%.o: $(DIR_GGML)/%.c
@@ -1406,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 
+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+	examples/llava/llava.cpp \
+	examples/llava/llava.h \
+	examples/llava/clip.cpp \
+	examples/llava/clip.h \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
```
```diff
@@ -2,59 +2,6 @@
 
 import PackageDescription
 
-var sources = [
-    "src/llama.cpp",
-    "src/llama-vocab.cpp",
-    "src/llama-grammar.cpp",
-    "src/llama-sampling.cpp",
-    "src/unicode.cpp",
-    "src/unicode-data.cpp",
-    "ggml/src/ggml.c",
-    "ggml/src/ggml-aarch64.c",
-    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.cpp",
-    "ggml/src/ggml-backend-reg.cpp",
-    "ggml/src/ggml-cpu/ggml-cpu.c",
-    "ggml/src/ggml-cpu/ggml-cpu.cpp",
-    "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
-    "ggml/src/ggml-cpu/ggml-cpu-quants.c",
-    "ggml/src/ggml-threading.cpp",
-    "ggml/src/ggml-quants.c",
-]
-
-var resources: [Resource] = []
-var linkerSettings: [LinkerSetting] = []
-var cSettings: [CSetting] = [
-    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-    .unsafeFlags(["-fno-objc-arc"]),
-    .headerSearchPath("ggml/src"),
-    .headerSearchPath("ggml/src/ggml-cpu"),
-    // NOTE: NEW_LAPACK will required iOS version 16.4+
-    // We should consider add this in the future when we drop support for iOS 14
-    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-    // .define("ACCELERATE_NEW_LAPACK"),
-    // .define("ACCELERATE_LAPACK_ILP64")
-    .define("GGML_USE_CPU"),
-]
-
-
-#if canImport(Darwin)
-sources.append("ggml/src/ggml-common.h")
-sources.append("ggml/src/ggml-metal/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
-linkerSettings.append(.linkedFramework("Accelerate"))
-cSettings.append(
-    contentsOf: [
-        .define("GGML_USE_ACCELERATE"),
-        .define("GGML_USE_METAL"),
-    ]
-)
-#endif
-
-#if os(Linux)
-    cSettings.append(.define("_GNU_SOURCE"))
-#endif
-
 let package = Package(
     name: "llama",
     platforms: [
@@ -67,26 +14,6 @@ let package = Package(
         .library(name: "llama", targets: ["llama"]),
     ],
     targets: [
-        .target(
-            name: "llama",
-            path: ".",
-            exclude: [
-               "build",
-               "cmake",
-               "examples",
-               "scripts",
-               "models",
-               "tests",
-               "CMakeLists.txt",
-               "Makefile",
-               "ggml/src/ggml-metal-embed.metal"
-            ],
-            sources: sources,
-            resources: resources,
-            publicHeadersPath: "spm-headers",
-            cSettings: cSettings,
-            linkerSettings: linkerSettings
-        )
-    ],
-    cxxLanguageStandard: .cxx17
+        .systemLibrary(name: "llama", pkgConfig: "llama"),
+    ]
 )
```
README.md: 15 changes
````diff
@@ -110,6 +110,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
 
 </details>
 
@@ -433,6 +434,20 @@ To learn more about model quantization, [read this documentation](examples/quant
 
   </details>
 
+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].
+
+- <details>
+    <summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
+
+      ```bash
+      llama-run granite-code
+      ```
+
+  </details>
+
+[^3]: [https://github.com/containers/ramalama](RamaLama)
+
 ## [`llama-simple`](examples/simple)
 
````
Sources/llama/llama.h (new file): 4 lines
```diff
@@ -0,0 +1,4 @@
+#pragma once
+
+#include <llama.h>
+
```
Sources/llama/module.modulemap (new file): 5 lines
```diff
@@ -0,0 +1,5 @@
+module llama [system] {
+    header "llama.h"
+    link "llama"
+    export *
+}
```
```diff
@@ -6,5 +6,5 @@ includedir=${prefix}/include
 Name: llama
 Description: Port of Facebook's LLaMA model in C/C++
 Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lllama
+Libs: -L${libdir} -lggml -lggml-base -lllama
 Cflags: -I${includedir}
```
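The `Libs` change reflects that the ggml symbols now live in separate `libggml`/`libggml-base` libraries, so a pkg-config consumer has to link all three. A minimal sketch of such a consumer, assuming an installed llama.cpp (compile with `c++ main.cpp $(pkg-config --cflags --libs llama)`):

```cpp
// Smallest program that exercises the link line advertised by llama.pc.
// llama_backend_init/llama_backend_free are real llama.h entry points;
// if this links and runs, all three libraries resolved correctly.
#include <llama.h>

int main() {
    llama_backend_init();
    llama_backend_free();
    return 0;
}
```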
cmake/x64-windows-llvm.cmake (new file): 11 lines
```diff
@@ -0,0 +1,11 @@
+set( CMAKE_SYSTEM_NAME Windows )
+set( CMAKE_SYSTEM_PROCESSOR x86_64 )
+
+set( CMAKE_C_COMPILER clang )
+set( CMAKE_CXX_COMPILER clang++ )
+
+set( arch_c_flags "-march=native" )
+
+set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
+set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
+
```
```diff
@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
     find_package(CURL REQUIRED)
-    add_definitions(-DLLAMA_USE_CURL)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
```
```diff
@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
```
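The new helpers keep the accepted names and the `--help` text in sync by deriving both from one table. A self-contained sketch of the same pattern, with a hypothetical enum standing in for `ggml_type`:

```cpp
#include <map>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for ggml_type; the repository code iterates a
// vector and compares ggml_type_name(type) instead, but the idea is the
// same: one table drives both parsing and the "allowed values" help text.
enum class cache_type { f32, f16, q8_0 };

static const std::map<std::string, cache_type> kv_types = {
    { "f32",  cache_type::f32  },
    { "f16",  cache_type::f16  },
    { "q8_0", cache_type::q8_0 },
};

static cache_type cache_type_from_str(const std::string & s) {
    auto it = kv_types.find(s);
    if (it == kv_types.end()) {
        throw std::runtime_error("Unsupported cache type: " + s);
    }
    return it->second;
}

static std::string all_cache_types() {
    std::string out;
    for (const auto & kv : kv_types) {
        out += (out.empty() ? "" : ", ") + kv.first;
    }
    return out; // "f16, f32, q8_0"
}
```

With this shape, adding a type is a one-line table edit, and the help output can never drift from what the parser accepts.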
```diff
@@ -591,7 +620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1174,18 +1203,28 @@
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -1711,6 +1750,13 @@
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--no-webui"},
+        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.webui = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2076,35 +2122,35 @@
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2124,14 +2170,14 @@
             fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
         }
     }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }
```
```diff
@@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
@@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }
 
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
 
     return cparams;
 }
@@ -1108,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
 static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
```
```diff
@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;
 
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
 
 struct common_control_vector_load_info;
 
@@ -286,8 +286,8 @@ struct common_params {
     bool warmup        = true;  // warmup run
     bool check_tensors = false; // validate tensor data
 
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
@@ -437,6 +437,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
     return parts;
 }
 
+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
```
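The prefix helper is hoisted into the common header so multiple translation units can share it. A quick check of its `rfind(prefix, 0)` trick, which only ever matches at position 0:

```cpp
#include <cassert>
#include <string>

static bool string_starts_with(const std::string & str, const std::string & prefix) {
    // rfind with pos = 0 searches only offsets <= 0, i.e. a pure prefix test
    return str.rfind(prefix, 0) == 0;
}

int main() {
    assert(string_starts_with("visual.patch_embed", "visual."));
    assert(!string_starts_with("vis", "visual.")); // shorter than the prefix
    assert(string_starts_with("anything", ""));    // empty prefix always matches
    return 0;
}
```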
```diff
@@ -1992,6 +1992,37 @@ class Qwen2Model(Model):
         except FileNotFoundError:
             self._set_vocab_gpt2()
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "yarn":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+
+
+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
```
````diff
@@ -55,7 +55,14 @@ cmake --build build --config Release
   cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
   cmake --build build-arm64-windows-llvm-release
   ```
-Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
+
+For building with ninja generator and clang compiler as default:
+-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
+```bash
+cmake --preset x64-windows-llvm-release
+cmake --build build-x64-windows-llvm-release
+```
 
 ## BLAS Build
````
```diff
@@ -20,7 +20,12 @@ else()
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(gbnf-validator)
+
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
+        add_subdirectory(gbnf-validator)
+    endif()
+
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -46,12 +51,16 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
     add_subdirectory(gen-docs)
     if (NOT GGML_BACKEND_DL)
         # these examples use the backends directly and cannot be built with dynamic loading
         add_subdirectory(convert-llama2c-to-ggml)
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
-        add_subdirectory(quantize-stats)
+        if (NOT WIN32)
+            # disabled on Windows because it uses internal functions not exported with LLAMA_API
+            add_subdirectory(quantize-stats)
+        endif()
         add_subdirectory(llava)
         if (GGML_RPC)
             add_subdirectory(rpc)
```
```diff
@@ -287,7 +287,7 @@ struct split_strategy {
     }
 
     void print_info() {
-        printf("n_split: %ld\n", ctx_outs.size());
+        printf("n_split: %zu\n", ctx_outs.size());
         int i_split = 0;
         for (auto & ctx_out : ctx_outs) {
             // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
            i_split++;
         }
     }
```
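The format-string fixes in this and the following hunks address the same portability bug: `size_t` is not `long` on LLP64 platforms such as 64-bit Windows, so `%ld` invokes undefined behavior there, while `%zu` is the conversion that C and C++ define for `size_t`. A minimal illustration:

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> v(3);
    // v.size() is size_t: 64-bit on win64 while long stays 32-bit there,
    // so "%ld" would read the wrong width; "%zu" always matches size_t.
    std::printf("n = %zu\n", v.size());
    return 0;
}
```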
```diff
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
             test_gen(ctx, 1, t.n_threads);
         }
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
 
             if (t.n_prompt > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
                 test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
                 test_gen(ctx, t.n_gen, t.n_threads);
```
```diff
@@ -210,20 +210,20 @@ actor LlamaContext {
 
         llama_kv_cache_clear(context)
 
-        let t_pp_start = ggml_time_us()
+        let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
         if llama_decode(context, batch) != 0 {
             print("llama_decode() failed during prompt")
         }
         llama_synchronize(context)
 
-        let t_pp_end = ggml_time_us()
+        let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
         // bench text generation
 
         llama_kv_cache_clear(context)
 
-        let t_tg_start = ggml_time_us()
+        let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
 
         for i in 0..<tg {
             llama_batch_clear(&batch)
@@ -238,7 +238,7 @@ actor LlamaContext {
             llama_synchronize(context)
         }
 
-        let t_tg_end = ggml_time_us()
+        let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
 
         llama_kv_cache_clear(context)
```
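Swapping `ggml_time_us()` for `DispatchTime` removes the Swift example's reliance on a ggml internal while keeping a monotonic microsecond clock, which matters now that the package consumes llama as a prebuilt system library. The C++ equivalent of that clock is `std::chrono::steady_clock`, sketched here with placeholder work:

```cpp
#include <chrono>
#include <cstdio>

int main() {
    const auto t0 = std::chrono::steady_clock::now();
    volatile double acc = 0; // stand-in for the benchmarked decode loop
    for (int i = 0; i < 1000000; i++) acc += i;
    const auto t1 = std::chrono::steady_clock::now();
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
    std::printf("elapsed: %lld us\n", static_cast<long long>(us));
    return 0;
}
```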
```diff
@@ -7,6 +7,7 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; };
 		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
 		79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; };
 		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
@@ -17,7 +18,6 @@
 		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
 		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
-		DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
 		F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
 /* End PBXBuildFile section */
 
@@ -42,7 +42,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				DF810E132B4A5BA200301144 /* llama in Frameworks */,
+				1809696D2D05A39F00400EE8 /* llama in Frameworks */,
 				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
 				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
 			);
@@ -151,7 +151,7 @@
 			);
 			name = llama.swiftui;
 			packageProductDependencies = (
-				DF810E122B4A5BA200301144 /* llama */,
+				1809696C2D05A39F00400EE8 /* llama */,
 			);
 			productName = llama.swiftui;
 			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
@@ -429,7 +429,7 @@
 /* End XCConfigurationList section */
 
 /* Begin XCSwiftPackageProductDependency section */
-		DF810E122B4A5BA200301144 /* llama */ = {
+		1809696C2D05A39F00400EE8 /* llama */ = {
 			isa = XCSwiftPackageProductDependency;
 			productName = llama;
 		};
```
```diff
@@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-qwen2vl-cli)
+add_executable(${TARGET} qwen2vl-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
```
```diff
@@ -102,7 +102,9 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU "clip.use_gelu"
+#define KEY_USE_SILU "clip.use_silu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
 #define KEY_N_BLOCK "clip.%s.block_count"
@@ -129,7 +131,8 @@ static std::string format(const char * fmt, ...) {
 #define TN_TOKEN_EMBD "%s.token_embd.weight"
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
-#define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_PATCH_EMBD "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
+#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@@ -163,6 +166,7 @@ enum projector_type {
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -171,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP,       "ldp" },
     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
 };
 
@@ -463,7 +468,8 @@ struct clip_vision_model {
 
     // embeddings
     struct ggml_tensor * class_embedding;
-    struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_embeddings_0;
+    struct ggml_tensor * patch_embeddings_1;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
     struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
@@ -553,6 +559,7 @@ struct clip_ctx {
     bool has_vision_encoder  = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    bool has_qwen2vl_merger = false;
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
@@ -561,6 +568,7 @@ struct clip_ctx {
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
+    bool use_silu = false;
     int32_t ftype = 1;
 
     bool has_class_embedding = true;
@@ -606,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             image_size_height = imgs->data->ny;
         }
     }
+    else if (ctx->has_qwen2vl_merger) {
+        // use the image's native resolution when image is avaible
+        if (is_inf) {
+            // if (imgs->data->nx && imgs->data->ny) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size           = hparams.patch_size;
     const int num_patches          = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int patches_w            = image_size_width / patch_size;
+    const int patches_h            = image_size_height / patch_size;
     const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_position_ids     = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
     const int hidden_size          = hparams.hidden_size;
     const int n_head               = hparams.n_head;
     const int d_head               = hidden_size / n_head;
     int n_layer                    = hparams.n_layer;
     const float eps                = hparams.eps;
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
     const int batch_size = imgs->size;
 
@@ -634,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
 
-    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    if (ctx->has_qwen2vl_merger) {
+        GGML_ASSERT(image_size_width  % (patch_size * 2) == 0);
+        GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, patches_h, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+        inp = ggml_reshape_3d(
+            ctx0, inp,
+            hidden_size, patches_w * patches_h, batch_size);
+    }
+    else {
+        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    }
 
     if (ctx->has_patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
@@ -659,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }
 
-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
 
-    embeddings =
-        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding
+        embeddings =
+            ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    }
 
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
@@ -688,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
         // TODO: figure out why we doing thing in this way ???
         n_layer += 1;
     }
     for (int il = 0; il < n_layer - 1; il++) {
@@ -710,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             struct ggml_tensor * Q =
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
 
-            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+            if (ctx->has_qwen2vl_merger) {
+                Q = ggml_rope_multi(
+                    ctx0, Q, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            }
+            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
             Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
             Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
 
@@ -719,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
 
             K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+            if (ctx->has_qwen2vl_merger) {
+                K = ggml_rope_multi(
+                    ctx0, K, positions, nullptr,
+                    d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            }
             K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
             K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
 
@@ -758,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         if (ctx->use_gelu) {
             cur = ggml_gelu_inplace(ctx0, cur);
+        } else if (ctx->use_silu) {
+            cur = ggml_silu_inplace(ctx0, cur);
         } else {
             cur = ggml_gelu_quick_inplace(ctx0, cur);
         }
@@ -769,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
+
     }
 
     // post-layernorm
@@ -1030,6 +1086,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ASSERT(false);
         }
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
+
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+    }
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
```
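The merger projector above concatenates each 2x2 neighbourhood of patch embeddings, so the token count drops by a factor of four while the channel width grows by the same factor before the two-layer MLP. The arithmetic, with illustrative sizes (assumed for the example; the real values come from the model's hparams, not from this diff):

```cpp
#include <cassert>

int main() {
    // assumed example: 8x8 patch grid, hidden size 1280
    const int patches_w = 8, patches_h = 8, hidden_size = 1280;
    const int num_positions = patches_w * patches_h;

    // after ggml_reshape_3d(..., hidden_size * 4, num_positions / 4, ...):
    const int merged_tokens   = num_positions / 4; // 16 tokens reach the MLP
    const int merged_channels = hidden_size * 4;   // 5120 channels per token
    assert(merged_tokens == 16 && merged_channels == 5120);
    return 0;
}
```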
|
||||
@@ -1206,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
|
||||
}
|
||||
|
||||
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
|
||||
if (idx != -1) {
|
||||
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
|
||||
}
|
||||
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
||||
|
||||
GGML_ASSERT(new_clip->has_vision_encoder);
|
||||
@@ -1214,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
idx = get_key_idx(ctx, KEY_USE_GELU);
|
||||
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        try {
            idx = get_key_idx(ctx, KEY_USE_SILU);
            new_clip->use_silu = gguf_get_val_bool(ctx, idx);
        } catch (std::runtime_error & /*e*/) {
            new_clip->use_silu = false;
        }

        if (verbosity >= 1) {
            LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
@@ -1389,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        try {
            vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        } catch(const std::exception& /*e*/) {
            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
        }
        try {
            vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
        } catch(const std::exception& /*e*/) {
            new_clip->has_qwen2vl_merger = false;
        }

        // LLaVA projection
        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1481,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
        }
        else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
            vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
            vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
            vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
            vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
        }
        else {
            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1519,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
        clip_image_f32_batch batch;
        batch.size = 1;
        batch.data = nullptr;
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -1532,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
    ctx_clip->load_image_size = load_image_size;
}

struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
    return ctx_clip->load_image_size;
}

struct clip_image_size * clip_image_size_init() {
    struct clip_image_size * load_image_size = new struct clip_image_size();
    load_image_size->width = 448;
@@ -1984,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        }
        return true;
    }
    else if (ctx->has_qwen2vl_merger) {
        clip_image_u8 * resized = clip_image_u8_init();
        auto patch_size = clip_patch_size(ctx) * 2;
        int nx = ceil((float)img->nx / patch_size) * patch_size;
        int ny = ceil((float)img->ny / patch_size) * patch_size;
        bicubic_resize(*img, *resized, nx, ny);

        res_imgs->data = new clip_image_f32[1];
        // clip_image_f32 * res = clip_image_f32_init();
        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
        // res_imgs->data[0] = *res;
        res_imgs->size = 1;

        // clip_image_f32_free(res);
        clip_image_u8_free(resized);
        return true;
    }

    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
@@ -2173,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
}

size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
    clip_image_f32 img;
    img.nx = img_w;
    img.ny = img_h;
    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}

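// example (for the merger projector): a 448x532 input with patch_size 14
// merges 2x2 patches, giving ceil(448/28) * ceil(532/28) = 16 * 19 = 304
// patches, i.e. a buffer of 304 * clip_n_mmproj_embd(ctx) * sizeof(float)
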
int32_t clip_image_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.image_size;
}
@@ -2194,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
}

int clip_n_patches(const struct clip_ctx * ctx) {
    clip_image_f32 img;
    img.nx = ctx->vision_model.hparams.image_size;
    img.ny = ctx->vision_model.hparams.image_size;
    return clip_n_patches_by_img(ctx, &img);
}

int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->vision_model.hparams;

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -2207,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
        else if (ctx->minicpmv_version == 3) {
            n_patches = 64;
        }
    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
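        // note: the merger fuses 2x2 patches into one embedding, so the
        // effective patch size doubles and partially covered patches are
        // rounded up (ceil division) in each dimension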
        int patch_size = params.patch_size * 2;
        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
        n_patches = x_patch * y_patch;
    }

    return n_patches;
@@ -2335,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int image_size = hparams.image_size;
    int image_size_width  = image_size;
    int image_size_height = image_size;
    if (ctx->has_minicpmv_projector) {
    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
        image_size_width  = imgs->data[0].nx;
        image_size_height = imgs->data[0].ny;
    }
@@ -2355,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    for (size_t i = 0; i < imgs->size; i++) {
        const int nx = imgs->data[i].nx;
        const int ny = imgs->data[i].ny;
        if (!ctx->has_minicpmv_projector) {
        if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
            GGML_ASSERT(nx == image_size && ny == image_size);
        }

@@ -2413,9 +2545,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
            for(int i=0;i<pos_w * pos_h;++i){
                for(int j=0;j<embed_dim;++j){
                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
            for(int i=0;i < pos_w * pos_h; ++i){
                for(int j=0; j < embed_dim; ++j){
                    pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j];
                }
            }

@@ -2435,7 +2567,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            }
        }

        {
        if (ctx->has_qwen2vl_merger) {
            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

            const int pw = image_size_width  / patch_size;
            const int ph = image_size_height / patch_size;
            int* positions_data = (int*)malloc(ggml_nbytes(positions));

            int ptr = 0;
            for (int y = 0; y < ph; y+=2)
            {
                for (int x = 0; x < pw; x+=2)
                {
                    for (int dy = 0; dy < 2; dy++) {
                        for (int dx = 0; dx < 2; dx++) {
                            positions_data[ptr] = y + dy;
                            positions_data[num_patches + ptr] = x + dx;
                            positions_data[num_patches * 2 + ptr] = y + dy;
                            positions_data[num_patches * 3 + ptr] = x + dx;
                            ptr++;
                        }
                    }
                }
            }

            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }
        else {
            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

            int* positions_data = (int*)malloc(ggml_nbytes(positions));
@@ -2444,16 +2603,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }

        {
            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
            int* patches_data = (int*)malloc(ggml_nbytes(patches));
            for (int i = 0; i < num_patches; i++) {
                patches_data[i] = i + 1;
            {
                struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
                int* patches_data = (int*)malloc(ggml_nbytes(patches));
                for (int i = 0; i < num_patches; i++) {
                    patches_data[i] = i + 1;
                }
                ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
                free(patches_data);
            }
            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
            free(patches_data);
        }
    }

@@ -2626,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return 3584;
        }
    }
    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
        return ctx->vision_model.mm_1_b->ne[0];
    }

    std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
    throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2637,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
    }
    return 0;
}

bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
    return ctx->has_qwen2vl_merger;
}


bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
    clip_image_f32 clip_img;
    clip_img.buf.resize(h * w * 3);
    for (int i = 0; i < h*w*3; i++)
    {
        clip_img.buf[i] = img[i];
    }
    clip_img.nx = w;
    clip_img.ny = h;
    clip_image_encode(ctx, n_threads, &clip_img, vec);
    return true;
}

@@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API void clip_free(struct clip_ctx * ctx);

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
@@ -55,11 +56,13 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);

CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8  * clip_image_u8_init ();
@@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int  clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

#ifdef __cplusplus
}

@@ -259,25 +259,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

    if (clip_is_minicpmv(ctx_clip)) {
    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        struct clip_image_size * load_image_size = clip_image_size_init();

        for (size_t i = 0; i < img_res_v.size; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
            int patch_size = 14;
            load_image_size->width = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);

            bool encoded = false;
            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
            if (has_minicpmv_projector == 2) {
                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
            }
            else if (has_minicpmv_projector == 3) {
            if (clip_is_qwen2vl(ctx_clip)) {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            else {
                int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
                if (has_minicpmv_projector == 2) {
                    encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
                }
                else if (has_minicpmv_projector == 3) {
                    encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
                }
            }

            if (!encoded) {
                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
@@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
            n_img_pos_out += clip_n_patches(ctx_clip);
            std::memcpy(
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    if (clip_is_minicpmv(ctx_clip)) {
        num_max_patches = 10;
    }
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    float * image_embd;
    if (clip_is_qwen2vl(ctx_clip)) {
        // qwen2vl doesn't split the image into chunks, so `num_max_patches` is not needed
        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
    } else {
        image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    }
    if (!image_embd) {
        LOG_ERR("Unable to allocate memory for image embeddings\n");
        return false;

158
examples/llava/qwen2_vl_surgery.py
Normal file
@@ -0,0 +1,158 @@
import argparse
from typing import Dict

import torch
import numpy as np
from gguf import *
from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    AutoProcessor,
    Qwen2VLConfig
)


VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def to_gguf_name(name: str) -> str:
    og = name
    name = name.replace("text_model", "t").replace("vision_model", "v")
    name = name.replace("blocks", "blk").replace("embeddings.", "")
    name = name.replace("attn.", "attn_")
    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
    name = name.replace("merger.mlp", 'mm')
    print(f"[to_gguf_name] {og} --> {name}")
    return name


def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
    vision_model = qwen2vl.visual
    tensor_map = {}
    for name, ten in vision_model.state_dict().items():
        ten = ten.numpy()
        if 'qkv' in name:
            if ten.ndim == 2:  # weight
                c3, _ = ten.shape
            else:              # bias
                c3 = ten.shape[0]
            assert c3 % 3 == 0
            c = c3 // 3
            wq = ten[:c]
            wk = ten[c: c * 2]
            wv = ten[c * 2:]
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
        elif 'merger' in name:
            if name.endswith("ln_q.weight"):
                tensor_map['v.post_ln.weight'] = ten
            elif name.endswith("ln_q.bias"):
                tensor_map['v.post_ln.bias'] = ten
            else:
                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
                tensor_map[to_gguf_name(name)] = ten
        elif 'patch_embed.proj.weight' in name:
            # NOTE: split Conv3D into Conv2Ds
            c1, c2, kt, kh, kw = ten.shape
            assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
        else:
            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten

    for new_name, ten in tensor_map.items():
        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
            tensor_map[new_name] = ten.astype(np.float32)
        else:
            tensor_map[new_name] = ten.astype(dtype)
    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
    return tensor_map


def main(args):
    if args.data_type == 'fp32':
        dtype = torch.float32
        np_dtype = np.float32
        ftype = 0
    elif args.data_type == 'fp16':
        dtype = torch.float32
        np_dtype = np.float16
        ftype = 1
    else:
        raise ValueError()

    model_name = args.model_name
    print("model_name: ", model_name)
    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=dtype, device_map="cpu"
    )
    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
    vcfg = cfg.vision_config

    if os.path.isdir(model_name):
        if model_name.endswith(os.sep):
            model_name = model_name[:-1]
        model_name = os.path.basename(model_name)
    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"

    fout = GGUFWriter(path=fname_out, arch="clip")
    fout.add_description("image encoder for Qwen2VL")

    fout.add_file_type(ftype)
    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_qwen2vl_merger", True)
    fout.add_string("clip.projector_type", "qwen2vl_merger")

    print(cfg.vision_config)
    if 'silu' in cfg.vision_config.hidden_act.lower():
        fout.add_bool("clip.use_silu", True)
        fout.add_bool("clip.use_gelu", False)
    elif 'gelu' in cfg.vision_config.hidden_act.lower():
        fout.add_bool("clip.use_silu", False)
        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
    else:
        raise ValueError()

    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
    for name, data in tensor_map.items():
        fout.add_tensor(name, data)

    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divisible by (14*2)
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
    fout.add_name(model_name)
    """
    HACK: Since vision rope related parameters aren't stored in the `Qwen2VLConfig`,
          they are hardcoded in `clip_image_build_graph` in `clip.cpp`.
    """

    processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean)  # type: ignore[reportAttributeAccessIssue]
    fout.add_array("clip.vision.image_std", processor.image_processor.image_std)  # type: ignore[reportAttributeAccessIssue]

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()
    print("save model as: ", fname_out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
    args = parser.parse_args()
    main(args)

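A minimal sketch of running the surgery script above, assuming the `torch`, `transformers` and `gguf` Python packages are installed; the output name follows the `<model>-vision.gguf` pattern the script constructs:

```bash
python examples/llava/qwen2_vl_surgery.py "Qwen/Qwen2-VL-2B-Instruct" --data_type fp16
# expected output: qwen-qwen2-vl-2b-instruct-vision.gguf in the working directory
```
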
581
examples/llava/qwen2vl-cli.cpp
Normal file
@@ -0,0 +1,581 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef NDEBUG
#include "ggml-alloc.h"
#include "ggml-backend.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>


static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
    const int patch_size = 14 * 2;
    const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
    const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
    auto img_tokens = image_embed->n_image_pos;
    // llama_pos mrope_pos[img_tokens * 4];
    std::vector<llama_pos> mrope_pos;
    mrope_pos.resize(img_tokens * 4);

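    // note: mrope_pos is laid out as 4 planar sections of img_tokens entries
    // (sequence, row, column, unused): every image token shares one sequence
    // position while row/column give its location in the merged patch grid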
    for (int y = 0; y < ph; y++)
    {
        for (int x = 0; x < pw; x++)
        {
            int i = y * pw + x;
            mrope_pos[i] = *st_pos_id;
            mrope_pos[i + img_tokens] = *st_pos_id + y;
            mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
            mrope_pos[i + img_tokens * 3] = 0;
        }
    }
    *st_pos_id += std::max(pw, ph);

    int processed = 0;
    std::vector<llama_pos> batch_mrope_pos;
    batch_mrope_pos.resize(img_tokens * 4);

    for (int i = 0; i < img_tokens; i += n_batch) {
        int n_eval = img_tokens - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }

        // llama_pos batch_mrope_pos[n_eval * 4];
        std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
        memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
        memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
        memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
        memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));

        llama_batch batch = {
            int32_t(n_eval),                // n_tokens
            nullptr,                        // token
            (image_embed->embed+i*n_embd),  // embed
            batch_mrope_pos.data(),         // pos
            nullptr,  // n_seq_id
            nullptr,  // seq_id
            nullptr,  // logits
        };

        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
        processed += n_eval;
    }
    return true;
}


static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
    int N = (int) tokens.size();
    std::vector<llama_pos> pos;
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        auto batch = llama_batch_get_one(&tokens[i], n_eval);
        // TODO: add mrope pos ids somewhere else
        pos.resize(batch.n_tokens * 4);
        std::fill(pos.begin(), pos.end(), 0);
        for (int j = 0; j < batch.n_tokens * 3; j ++) {
            pos[j] = *st_pos_id + (j % batch.n_tokens);
        }
        batch.pos = pos.data();

        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
        *st_pos_id += n_eval;
    }
    return true;
}

static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
}

static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
    std::string str2 = str;
    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
    return true;
}

static const char * sample(struct common_sampler * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past, int * st_pos_id) {
    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
    common_sampler_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
        ret = common_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past, st_pos_id);
    return ret.c_str();
}

static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";

static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}

static bool prompt_contains_image(const std::string& prompt) {
    size_t begin, end;
    find_image_tag_in_prompt(prompt, begin, end);
    return (begin != std::string::npos);
}

// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count);

    auto required_bytes = base64::required_encode_size(base64_str.size());
    auto img_bytes = std::vector<unsigned char>(required_bytes);
    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

    return embed;
}

static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
    size_t begin, end;
    find_image_tag_in_prompt(prompt, begin, end);
    if (begin == std::string::npos || end == std::string::npos) {
        return prompt;
    }
    auto pre = prompt.substr(0, begin);
    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
    return pre + replacement + post;
}

struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
    struct llama_context * ctx_llama = NULL;
    struct llama_model * model = NULL;
};

static void print_usage(int, char ** argv) {
    LOG("\n example usage:\n");
    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {

    // load and preprocess the image
    llava_image_embed * embed = NULL;
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
            LOG_INF("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
        if (!embed) {
            LOG_ERR("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
        if (!embed) {
            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
            return NULL;
        }
    }

    return embed;
}

static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
    int n_past = 0;
    int cur_pos_id = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

    std::string system_prompt, user_prompt;
    size_t image_pos = prompt.find("<|vision_start|>");
    if (image_pos != std::string::npos) {
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
        // llava-1.5 native mode
        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
        user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
        if (params->verbose_prompt) {
            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }

    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
    if (image_embed != nullptr) {
        auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
        qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
    }
    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);

    // generate the response

    LOG("\n");

    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
    if (!smpl) {
        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }

    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
        LOG("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6

        fflush(stdout);
    }

    common_sampler_free(smpl);
    LOG("\n");
}

static struct llama_model * llava_init(common_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

    llama_model_params model_params = common_model_params_to_llama(*params);

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return NULL;
    }
    return model;
}

static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

    llama_context_params ctx_params = common_context_params_to_llama(*params);
    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        LOG_ERR("%s: failed to create the llama_context\n", __func__);
        return NULL;
    }

    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->ctx_clip = ctx_clip;
    ctx_llava->model = model;
    return ctx_llava;
}

static void llava_free(struct llava_context * ctx_llava) {
    if (ctx_llava->ctx_clip) {
        clip_free(ctx_llava->ctx_clip);
        ctx_llava->ctx_clip = NULL;
    }

    llama_free(ctx_llava->ctx_llama);
    llama_free_model(ctx_llava->model);
    llama_backend_free();
}

#ifndef NDEBUG

static void debug_test_mrope_2d() {
    // 1. Initialize backend
    ggml_backend_t backend = NULL;
    std::string backend_name = "";
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    backend_name = "cuda";
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif
    // if there aren't GPU Backends fallback to CPU backend
    if (!backend) {
        backend = ggml_backend_cpu_init();
        backend_name = "cpu";
    }

    // Calculate the size needed to allocate
    size_t ctx_size = 0;
    ctx_size += 2 * ggml_tensor_overhead(); // tensors
    // no need to allocate anything else!

    // 2. Allocate `ggml_context` to store tensor data
    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
    ggml_set_name(pos, "pos");
    ggml_set_input(pos);

    std::vector<float> dummy_q;
    dummy_q.resize(128 * 12 * 30);
    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));

    std::vector<int> pos_id;
    pos_id.resize(30 * 4);
    for (int i = 0; i < 30; i ++) {
        pos_id[i]      = i;
        pos_id[i + 30] = i + 10;
        pos_id[i + 60] = i + 20;
        pos_id[i + 90] = i + 30;
    }
    int sections[4] = {32, 32, 0, 0};

    // 4. Allocate a `ggml_backend_buffer` to store all tensors
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // 5. Copy tensor data from main memory (RAM) to backend buffer
    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));

    // 6. Create a `ggml_cgraph` for mul_mat operation
    struct ggml_cgraph * gf = NULL;
    struct ggml_context * ctx_cgraph = NULL;

    // create a temporary context to build the graph
    struct ggml_init_params params0 = {
        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };
    ctx_cgraph = ggml_init(params0);
    gf = ggml_new_graph(ctx_cgraph);

    struct ggml_tensor * result0 = ggml_rope_multi(
        ctx_cgraph, inp_raw, pos, nullptr,
        128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
        0, 1, 32, 1);

    // Add "result" tensor and all of its dependencies to the cgraph
    ggml_build_forward_expand(gf, result0);

    // 7. Create a `ggml_gallocr` for cgraph computation
    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
    ggml_gallocr_alloc_graph(allocr, gf);

    // 9. Run the computation
    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
    ggml_backend_graph_compute(backend, gf);

    // 10. Retrieve results (output tensors)
    // in this example, output tensor is always the last tensor in the graph
    struct ggml_tensor * result = result0;
    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
    float * result_data = (float *)malloc(ggml_nbytes(result));
    // because the tensor data is stored in device buffer, we need to copy it back to RAM
    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
    const std::string bin_file = "mrope_2d_" + backend_name +".bin";
    std::ofstream outFile(bin_file, std::ios::binary);

    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
        outFile.close();
        std::cout << "Data successfully written to " + bin_file << std::endl;
    } else {
        std::cerr << "Error opening file!" << std::endl;
    }

    free(result_data);
    // 11. Free memory and exit
    ggml_free(ctx_cgraph);
    ggml_gallocr_free(allocr);
    ggml_free(ctx);
    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
}

static void debug_dump_img_embed(struct llava_context * ctx_llava) {
    int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
    int ne = n_embd * 4;
    float vals[56 * 56 * 3];
    // float embd[ne];
    std::vector<float> embd;
    embd.resize(ne);

    for (int i = 0; i < 56*56; i++)
    {
        for (int c = 0; c < 3; c++)
            vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
    }

    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());

    std::ofstream outFile("img_embed.bin", std::ios::binary);
    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));

        outFile.close();
        std::cout << "Data successfully written to img_embed.bin" << std::endl;
    } else {
        std::cerr << "Error opening file!" << std::endl;
    }
}

#endif


int main(int argc, char ** argv) {
    ggml_time_init();

    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
        return 1;
    }

    common_init();

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv);
        return 1;
    }

    auto * model = llava_init(&params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
        return 1;
    }

    if (prompt_contains_image(params.prompt)) {
        auto * ctx_llava = llava_init_context(&params, model);

        auto * image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

        llama_perf_context_print(ctx_llava->ctx_llama);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
#ifndef NDEBUG
    } else if (params.image[0].empty()) {
        auto ctx_llava = llava_init_context(&params, model);

        debug_test_mrope_2d();
        debug_dump_img_embed(ctx_llava);

        llama_perf_context_print(ctx_llava->ctx_llama);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
#endif
    } else {
        for (auto & image : params.image) {
            auto * ctx_llava = llava_init_context(&params, model);

            auto * image_embed = load_image(ctx_llava, &params, image);
            if (!image_embed) {
                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
                return 1;
            }

            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

            llama_perf_context_print(ctx_llava->ctx_llama);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
        }
    }

    llama_free_model(model);

    return 0;
}

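A sketch of running the example above, assuming the build target is named `llama-qwen2vl-cli` (the target name is not shown in this diff) and using placeholder file names:

```bash
./llama-qwen2vl-cli \
    -m Qwen2-VL-2B-Instruct-Q4_K_M.gguf \
    --mmproj qwen-qwen2-vl-2b-instruct-vision.gguf \
    --image demo.jpg -p "Describe this image."
```
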
@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.

*(outdated)*

| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
@@ -83,7 +81,7 @@ The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interle
- [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
- [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
- [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
- [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
- [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
- [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B",  },
    { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B",  },
    { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B",  },
    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B",  },
    { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B",  },
    { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B",  },
    { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G              @ 7B",          },

@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
    }
    LOG_INF("Number of chunks: %ld\n", chunks.size());
    LOG_INF("Number of chunks: %zu\n", chunks.size());

    llama_backend_init();
    llama_numa_init(params.numa);

@@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

@@ -3,5 +3,45 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
llama-run granite-code
...
```

```bash
llama-run -h
Description:
  Runs a llm

Usage:
  llama-run [options] model [prompt]

Options:
  -c, --context-size <value>
      Context size (default: 2048)
  -n, --ngl <value>
      Number of GPU layers (default: 0)
  -h, --help
      Show help message

Commands:
  model
      Model is a string with an optional prefix of
      huggingface:// (hf://), ollama://, https:// or file://.
      If no protocol is specified and a file exists in the specified
      path, file:// is assumed, otherwise if a file does not exist in
      the specified path, ollama:// is assumed. Models that are being
      pulled are downloaded with .partial extension while being
      downloaded and then renamed as the file without the .partial
      extension when complete.

Examples:
  llama-run llama3
  llama-run ollama://granite-code
  llama-run ollama://smollm:135m
  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
  llama-run https://example.com/some-file1.gguf
  llama-run some-file2.gguf
  llama-run file://some-file3.gguf
  llama-run --ngl 99 some-file4.gguf
  llama-run --ngl 99 some-file5.gguf Hello World
...
```

@@ -1,128 +1,350 @@
#if defined(_WIN32)
#include <windows.h>
#    include <windows.h>
#else
#include <unistd.h>
#    include <unistd.h>
#endif

#include <climits>
#if defined(LLAMA_USE_CURL)
#    include <curl/curl.h>
#endif

#include <cstdarg>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "common.h"
#include "json.hpp"
#include "llama-cpp.h"

typedef std::unique_ptr<char[]> char_array_ptr;
#define printe(...)                   \
    do {                              \
        fprintf(stderr, __VA_ARGS__); \
    } while (0)

struct Argument {
    std::string flag;
    std::string help_text;
};
class Opt {
  public:
    int init(int argc, const char ** argv) {
        construct_help_str_();
        // Parse arguments
        if (parse(argc, argv)) {
            printe("Error: Failed to parse arguments.\n");
            help();
            return 1;
        }

struct Options {
    std::string model_path, prompt_non_interactive;
    int ngl = 99;
    int n_ctx = 2048;
};
        // If help is requested, show help and exit
        if (help_) {
            help();
            return 2;
        }

class ArgumentParser {
  public:
    ArgumentParser(const char * program_name) : program_name(program_name) {}

    void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
        string_args[flag] = &var;
        arguments.push_back({flag, help_text});
        return 0;  // Success
    }

    void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
        int_args[flag] = &var;
        arguments.push_back({flag, help_text});
    std::string model_;
    std::string user_;
    int context_size_ = 2048, ngl_ = -1;

  private:
    std::string help_str_;
    bool help_ = false;

    void construct_help_str_() {
        help_str_ =
            "Description:\n"
            "  Runs a llm\n"
            "\n"
            "Usage:\n"
            "  llama-run [options] model [prompt]\n"
            "\n"
            "Options:\n"
            "  -c, --context-size <value>\n"
            "      Context size (default: " +
            std::to_string(context_size_);
        help_str_ +=
            ")\n"
            "  -n, --ngl <value>\n"
            "      Number of GPU layers (default: " +
            std::to_string(ngl_);
        help_str_ +=
            ")\n"
            "  -h, --help\n"
            "      Show help message\n"
            "\n"
            "Commands:\n"
            "  model\n"
            "      Model is a string with an optional prefix of \n"
            "      huggingface:// (hf://), ollama://, https:// or file://.\n"
            "      If no protocol is specified and a file exists in the specified\n"
            "      path, file:// is assumed, otherwise if a file does not exist in\n"
            "      the specified path, ollama:// is assumed. Models that are being\n"
            "      pulled are downloaded with .partial extension while being\n"
            "      downloaded and then renamed as the file without the .partial\n"
            "      extension when complete.\n"
            "\n"
            "Examples:\n"
            "  llama-run llama3\n"
            "  llama-run ollama://granite-code\n"
            "  llama-run ollama://smollm:135m\n"
            "  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
            "  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
            "  llama-run https://example.com/some-file1.gguf\n"
            "  llama-run some-file2.gguf\n"
            "  llama-run file://some-file3.gguf\n"
            "  llama-run --ngl 99 some-file4.gguf\n"
            "  llama-run --ngl 99 some-file5.gguf Hello World\n";
    }

    int parse(int argc, const char ** argv) {
        int positional_args_i = 0;
        for (int i = 1; i < argc; ++i) {
            std::string arg = argv[i];
            if (string_args.count(arg)) {
                if (i + 1 < argc) {
                    *string_args[arg] = argv[++i];
                } else {
                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
                    print_usage();
            if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
                if (i + 1 >= argc) {
                    return 1;
                }
            } else if (int_args.count(arg)) {
                if (i + 1 < argc) {
                    if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
                        fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
                        print_usage();
                        return 1;
                    }
                } else {
                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
                    print_usage();

                context_size_ = std::atoi(argv[++i]);
            } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
                if (i + 1 >= argc) {
                    return 1;
                }

                ngl_ = std::atoi(argv[++i]);
            } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
                help_ = true;
                return 0;
            } else if (!positional_args_i) {
                ++positional_args_i;
                model_ = argv[i];
            } else if (positional_args_i == 1) {
                ++positional_args_i;
                user_ = argv[i];
            } else {
                fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
                print_usage();
                return 1;
                user_ += " " + std::string(argv[i]);
            }
        }

        if (string_args["-m"]->empty()) {
            fprintf(stderr, "error: -m is required\n");
            print_usage();
        return model_.empty();  // model_ is the only required value
        }

    void help() const { printf("%s", help_str_.c_str()); }
};

struct progress_data {
    size_t file_size = 0;
    std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
    bool printed = false;
};

struct FileDeleter {
    void operator()(FILE * file) const {
        if (file) {
            fclose(file);
        }
    }
};

typedef std::unique_ptr<FILE, FileDeleter> FILE_ptr;

#ifdef LLAMA_USE_CURL
class CurlWrapper {
  public:
    int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
             const bool progress, std::string * response_str = nullptr) {
        std::string output_file_partial;
        curl = curl_easy_init();
        if (!curl) {
            return 1;
        }

        progress_data data;
        FILE_ptr out;
        if (!output_file.empty()) {
            output_file_partial = output_file + ".partial";
            out.reset(fopen(output_file_partial.c_str(), "ab"));
        }

        set_write_options(response_str, out);
        data.file_size = set_resume_point(output_file_partial);
        set_progress_options(progress, data);
        set_headers(headers);
        perform(url);
        if (!output_file.empty()) {
            std::filesystem::rename(output_file_partial, output_file);
        }

        return 0;
    }

  private:
    const char * program_name;
    std::unordered_map<std::string, std::string *> string_args;
    std::unordered_map<std::string, int *> int_args;
    std::vector<Argument> arguments;
    ~CurlWrapper() {
        if (chunk) {
            curl_slist_free_all(chunk);
        }

    int parse_int_arg(const char * arg, int & value) {
        char * end;
        const long val = std::strtol(arg, &end, 10);
        if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
            value = static_cast<int>(val);
        if (curl) {
            curl_easy_cleanup(curl);
        }
    }

  private:
    CURL * curl = nullptr;
    struct curl_slist * chunk = nullptr;

    void set_write_options(std::string * response_str, const FILE_ptr & out) {
        if (response_str) {
            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
            curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
        } else {
            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
            curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get());
        }
    }

    size_t set_resume_point(const std::string & output_file) {
        size_t file_size = 0;
        if (std::filesystem::exists(output_file)) {
            file_size = std::filesystem::file_size(output_file);
            curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
        }

        return file_size;
    }

    void set_progress_options(bool progress, progress_data & data) {
        if (progress) {
            curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
            curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
        }
    }

    void set_headers(const std::vector<std::string> & headers) {
        if (!headers.empty()) {
            if (chunk) {
                curl_slist_free_all(chunk);
                chunk = 0;
            }

            for (const auto & header : headers) {
                chunk = curl_slist_append(chunk, header.c_str());
            }

            curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
        }
    }

    void perform(const std::string & url) {
        CURLcode res;
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
        curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
        res = curl_easy_perform(curl);
        if (res != CURLE_OK) {
            printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
        }
    }

    static std::string human_readable_time(double seconds) {
        int hrs  = static_cast<int>(seconds) / 3600;
        int mins = (static_cast<int>(seconds) % 3600) / 60;
        int secs = static_cast<int>(seconds) % 60;

        std::ostringstream out;
        if (hrs > 0) {
            out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0')
                << secs << "s";
        } else if (mins > 0) {
            out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s";
        } else {
            out << secs << "s";
        }

        return out.str();
    }

    static std::string human_readable_size(curl_off_t size) {
        static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
        char length = sizeof(suffix) / sizeof(suffix[0]);
        int i = 0;
        double dbl_size = size;
        if (size > 1024) {
            for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
                dbl_size = size / 1024.0;
            }
        }

        std::ostringstream out;
        out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i];
        return out.str();
    }

    static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
                                 curl_off_t) {
        progress_data * data = static_cast<progress_data *>(ptr);
        if (total_to_download <= 0) {
            return 0;
        }
        return 1;
    }

    void print_usage() const {
        printf("\nUsage:\n");
        printf("  %s [OPTIONS]\n\n", program_name);
        printf("Options:\n");
        for (const auto & arg : arguments) {
            printf("  %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
total_to_download += data->file_size;
|
||||
const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
|
||||
const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download;
|
||||
const curl_off_t pos = (percentage / 5);
|
||||
std::string progress_bar;
|
||||
for (int i = 0; i < 20; ++i) {
|
||||
progress_bar.append((i < pos) ? "█" : " ");
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
// Calculate download speed and estimated time to completion
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
const std::chrono::duration<double> elapsed_seconds = now - data->start_time;
|
||||
const double speed = now_downloaded / elapsed_seconds.count();
|
||||
const double estimated_time = (total_to_download - now_downloaded) / speed;
|
||||
printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(),
|
||||
human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(),
|
||||
speed / (1024 * 1024), human_readable_time(estimated_time).c_str());
|
||||
fflush(stderr);
|
||||
data->printed = true;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Function to write data to a file
|
||||
static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
|
||||
FILE * out = static_cast<FILE *>(stream);
|
||||
return fwrite(ptr, size, nmemb, out);
|
||||
}
|
||||
|
||||
// Function to capture data into a string
|
||||
static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
|
||||
std::string * str = static_cast<std::string *>(stream);
|
||||
str->append(static_cast<char *>(ptr), size * nmemb);
|
||||
return size * nmemb;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
class LlamaData {
  public:
    llama_model_ptr model;
    llama_sampler_ptr sampler;
    llama_context_ptr context;
    std::vector<llama_chat_message> messages;
    std::vector<std::string> msg_strs;
    std::vector<char> fmtted;

    int init(const Options & opt) {
        model = initialize_model(opt.model_path, opt.ngl);
    int init(Opt & opt) {
        model = initialize_model(opt);
        if (!model) {
            return 1;
        }

        context = initialize_context(model, opt.n_ctx);
        context = initialize_context(model, opt.context_size_);
        if (!context) {
            return 1;
        }
@@ -131,15 +353,123 @@ class LlamaData {
        return 0;
    }

  private:
    // Initializes the model and returns a unique pointer to it
    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = ngl;
        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
  private:
#ifdef LLAMA_USE_CURL
    int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
                 const bool progress, std::string * response_str = nullptr) {
        CurlWrapper curl;
        if (curl.init(url, headers, output_file, progress, response_str)) {
            return 1;
        }

        return 0;
    }
#else
    int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
                 std::string * = nullptr) {
        printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
        return 1;
    }
#endif
    int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
        // Find the second occurrence of '/' after protocol string
        size_t pos = model.find('/');
        pos = model.find('/', pos + 1);
        if (pos == std::string::npos) {
            return 1;
        }

        const std::string hfr = model.substr(0, pos);
        const std::string hff = model.substr(pos + 1);
        const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
        return download(url, headers, bn, true);
    }

    int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
        if (model.find('/') == std::string::npos) {
            model = "library/" + model;
        }

        std::string model_tag = "latest";
        size_t colon_pos = model.find(':');
        if (colon_pos != std::string::npos) {
            model_tag = model.substr(colon_pos + 1);
            model = model.substr(0, colon_pos);
        }

        std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
        std::string manifest_str;
        const int ret = download(manifest_url, headers, "", false, &manifest_str);
        if (ret) {
            return ret;
        }

        nlohmann::json manifest = nlohmann::json::parse(manifest_str);
        std::string layer;
        for (const auto & l : manifest["layers"]) {
            if (l["mediaType"] == "application/vnd.ollama.image.model") {
                layer = l["digest"];
                break;
            }
        }

        std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
        return download(blob_url, headers, bn, true);
    }

    std::string basename(const std::string & path) {
        const size_t pos = path.find_last_of("/\\");
        if (pos == std::string::npos) {
            return path;
        }

        return path.substr(pos + 1);
    }

    int remove_proto(std::string & model_) {
        const std::string::size_type pos = model_.find("://");
        if (pos == std::string::npos) {
            return 1;
        }

        model_ = model_.substr(pos + 3);  // Skip past "://"
        return 0;
    }

    int resolve_model(std::string & model_) {
        const std::string bn = basename(model_);
        const std::vector<std::string> headers = { "--header",
                                                   "Accept: application/vnd.docker.distribution.manifest.v2+json" };
        int ret = 0;
        if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) {
            remove_proto(model_);
        } else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
            remove_proto(model_);
            ret = huggingface_dl(model_, headers, bn);
        } else if (string_starts_with(model_, "ollama://")) {
            remove_proto(model_);
            ret = ollama_dl(model_, headers, bn);
        } else if (string_starts_with(model_, "https://")) {
            download(model_, headers, bn, true);
        } else {
            ret = ollama_dl(model_, headers, bn);
        }

        model_ = bn;

        return ret;
    }

    // Initializes the model and returns a unique pointer to it
    llama_model_ptr initialize_model(Opt & opt) {
        ggml_backend_load_all();
        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
        resolve_model(opt.model_);
        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
        if (!model) {
            fprintf(stderr, "%s: error: unable to load model\n", __func__);
            printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
        }

        return model;
@@ -148,12 +478,11 @@ class LlamaData {
    // Initializes the context with the specified parameters
    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
        llama_context_params ctx_params = llama_context_default_params();
        ctx_params.n_ctx = n_ctx;
        ctx_params.n_batch = n_ctx;

        ctx_params.n_ctx   = n_ctx;
        ctx_params.n_batch = n_ctx;
        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
        if (!context) {
            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
            printe("%s: error: failed to create the llama_context\n", __func__);
        }

        return context;
@@ -170,23 +499,22 @@ class LlamaData {
    }
};

// Add a message to `messages` and store its content in `owned_content`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
                        std::vector<char_array_ptr> & owned_content) {
    char_array_ptr content(new char[text.size() + 1]);
    std::strcpy(content.get(), text.c_str());
    llama_data.messages.push_back({role, content.get()});
    owned_content.push_back(std::move(content));
// Add a message to `messages` and store its content in `msg_strs`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
    llama_data.msg_strs.push_back(std::move(text));
    llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
}

// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
    if (result > static_cast<int>(formatted.size())) {
        formatted.resize(result);
static int apply_chat_template(LlamaData & llama_data, const bool append) {
    int result = llama_chat_apply_template(
        llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
        llama_data.fmtted.resize(result);
        result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
                                           llama_data.fmtted.size());
    }

    return result;
@@ -199,7 +527,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
    prompt_tokens.resize(n_prompt_tokens);
    if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
                       true) < 0) {
        GGML_ABORT("failed to tokenize the prompt\n");
        printe("failed to tokenize the prompt\n");
        return -1;
    }

    return n_prompt_tokens;
@@ -207,11 +536,11 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr

// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
    const int n_ctx = llama_n_ctx(ctx.get());
    const int n_ctx      = llama_n_ctx(ctx.get());
    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
    if (n_ctx_used + batch.n_tokens > n_ctx) {
        printf("\033[0m\n");
        fprintf(stderr, "context size exceeded\n");
        printe("context size exceeded\n");
        return 1;
    }

@@ -221,9 +550,10 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
// convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
    char buf[256];
    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
    int  n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
    if (n < 0) {
        GGML_ABORT("failed to convert token to piece\n");
        printe("failed to convert token to piece\n");
        return 1;
    }

    piece = std::string(buf, n);
@@ -238,19 +568,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st

// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
    std::vector<llama_token> prompt_tokens;
    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
    if (n_prompt_tokens < 0) {
    std::vector<llama_token> tokens;
    if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
        return 1;
    }

    // prepare a batch for the prompt
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
    llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
    llama_token new_token_id;
    while (true) {
        check_context_size(llama_data.context, batch);
        if (llama_decode(llama_data.context.get(), batch)) {
            GGML_ABORT("failed to decode\n");
            printe("failed to decode\n");
            return 1;
        }

        // sample the next token, check is it an end of generation?
@@ -273,22 +603,9 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
    return 0;
}

static int parse_arguments(const int argc, const char ** argv, Options & opt) {
    ArgumentParser parser(argv[0]);
    parser.add_argument("-m", opt.model_path, "model");
    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
    parser.add_argument("-c", opt.n_ctx, "context_size");
    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
    if (parser.parse(argc, argv)) {
        return 1;
    }

    return 0;
}

static int read_user_input(std::string & user) {
    std::getline(std::cin, user);
    return user.empty();  // Indicate an error or empty input
    return user.empty();  // Should have data in happy path
}

// Function to generate a response based on the prompt
@@ -296,7 +613,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
    // Set response color
    printf("\033[33m");
    if (generate(llama_data, prompt, response)) {
        fprintf(stderr, "failed to generate response\n");
        printe("failed to generate response\n");
        return 1;
    }

@@ -306,11 +623,10 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
}

// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
                                                   const bool is_user_input, int & output_length) {
    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
    const int new_len = apply_chat_template(llama_data, append);
    if (new_len < 0) {
        fprintf(stderr, "failed to apply the chat template\n");
        printe("failed to apply the chat template\n");
        return -1;
    }

@@ -319,56 +635,63 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data,
}

// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
    if (!prompt_non_interactive.empty()) {
        user_input = prompt_non_interactive;
        return true;  // No need for interactive input
static int handle_user_input(std::string & user_input, const std::string & user_) {
    if (!user_.empty()) {
        user_input = user_;
        return 0;  // No need for interactive input
    }

    printf("\033[32m> \033[0m");
    return !read_user_input(user_input);  // Returns false if input ends the loop
    printf(
        "\r "
        "\r\033[32m> \033[0m");
    return read_user_input(user_input);  // Returns true if input ends the loop
}

// Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
    std::vector<char_array_ptr> owned_content;
    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
static int chat_loop(LlamaData & llama_data, const std::string & user_) {
    int prev_len = 0;

    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
    while (true) {
        // Get user input
        std::string user_input;
        if (!handle_user_input(user_input, prompt_non_interactive)) {
            break;
        while (handle_user_input(user_input, user_)) {
        }

        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
                    owned_content);

        add_message("user", user_.empty() ? user_input : user_, llama_data);
        int new_len;
        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
        if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
            return 1;
        }

        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
        std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
        std::string response;
        if (generate_response(llama_data, prompt, response)) {
            return 1;
        }

        if (!user_.empty()) {
            break;
        }

        add_message("assistant", response, llama_data);
        if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
            return 1;
        }
    }

    return 0;
}

static void log_callback(const enum ggml_log_level level, const char * text, void *) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        fprintf(stderr, "%s", text);
        printe("%s", text);
    }
}

static bool is_stdin_a_terminal() {
#if defined(_WIN32)
    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
    DWORD mode;
    DWORD  mode;
    return GetConsoleMode(hStdin, &mode);
#else
    return isatty(STDIN_FILENO);
@@ -382,17 +705,20 @@ static std::string read_pipe_data() {
}

int main(int argc, const char ** argv) {
    Options opt;
    if (parse_arguments(argc, argv, opt)) {
    Opt opt;
    const int ret = opt.init(argc, argv);
    if (ret == 2) {
        return 0;
    } else if (ret) {
        return 1;
    }

    if (!is_stdin_a_terminal()) {
        if (!opt.prompt_non_interactive.empty()) {
            opt.prompt_non_interactive += "\n\n";
        if (!opt.user_.empty()) {
            opt.user_ += "\n\n";
        }

        opt.prompt_non_interactive += read_pipe_data();
        opt.user_ += read_pipe_data();
    }

    llama_log_set(log_callback, nullptr);
@@ -401,7 +727,7 @@ int main(int argc, const char ** argv) {
        return 1;
    }

    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
    if (chat_loop(llama_data, opt.user_)) {
        return 1;
    }


@@ -34,14 +34,6 @@ endforeach()
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)

# clean up generated files in pre-build step
foreach(asset ${PUBLIC_ASSETS})
    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
    add_custom_command(TARGET ${TARGET} PRE_BUILD
                       COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}"
    )
endforeach()

target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

if (LLAMA_SERVER_SSL)

@@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co
| -------- | ----------- |
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle), as some models prefer this (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a. dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@@ -146,6 +147,7 @@ The project is under active development, and we are [looking for feedback and co
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@@ -163,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |


Note: If both a command line argument and an environment variable are set for the same param, the argument will take precedence over the env var.
@@ -302,23 +304,23 @@ mkdir llama-client
cd llama-client
```

Create a index.js file and put this inside:
Create an index.js file and put this inside:

```javascript
const prompt = `Building a website can be done in 10 simple steps:`;
const prompt = "Building a website can be done in 10 simple steps:"

async function Test() {
async function test() {
    let response = await fetch("http://127.0.0.1:8080/completion", {
        method: 'POST',
        method: "POST",
        body: JSON.stringify({
            prompt,
            n_predict: 512,
            n_predict: 64,
        })
    })
    console.log((await response.json()).content)
}

Test()
test()
```

And run it:
@@ -380,7 +382,7 @@ Multiple prompts are also supported. In this case, the completion result will be
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.

`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
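For illustration, here is a minimal sketch of one non-streaming request that combines the fields above. The endpoint and field names are as documented; the prompt and the concrete values are arbitrary, and the snippet assumes an environment that allows top-level `await` (an ES module or a modern runtime):

```javascript
// Sketch: one /completion request combining n_keep, stream and stop.
// 127.0.0.1:8080 is the documented default host/port; values are arbitrary.
const res = await fetch("http://127.0.0.1:8080/completion", {
    method: "POST",
    body: JSON.stringify({
        prompt: "Building a website can be done in 10 simple steps:",
        n_predict: 128,
        n_keep: 32,       // keep the first 32 prompt tokens if the context overflows
        stream: false,    // wait for the full completion instead of streaming
        stop: ["\n\n"],   // stop strings are not included in the returned content
    }),
});
console.log((await res.json()).content);
```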
@@ -441,11 +443,11 @@ These words will not be included in the completion, so make sure to add them to

`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`

**Response format**

- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. A sketch of a client that reads this stream follows the list below.

- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:

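To make the streaming note above concrete, a client can read the event stream manually with `fetch`, since `EventSource` cannot send `POST` requests. This is a sketch, not part of the reference docs: it assumes the documented default address and a runtime with the Fetch and Streams APIs (a recent browser or Node), and the line-buffering detail mirrors what the old `completion.js` helper did:

```javascript
// Sketch: read server-sent events from /completion over a POST request.
const response = await fetch("http://127.0.0.1:8080/completion", {
    method: "POST",
    body: JSON.stringify({ prompt: "Hello,", n_predict: 64, stream: true }),
});

const reader = response.body.getReader();
const decoder = new TextDecoder();
let buffer = "";
while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split("\n");
    buffer = lines.pop(); // keep a partially received line for the next chunk
    for (const line of lines) {
        if (!line.startsWith("data: ")) continue; // SSE payload lines only
        const chunk = JSON.parse(line.slice(6));
        console.log(chunk.content); // only `content` and `stop` arrive until the end
        if (chunk.stop) console.log("[generation finished]");
    }
}
```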
@@ -618,14 +620,83 @@ This endpoint is public (no API key check). By default, it is read-only. To make

```json
{
  "default_generation_settings": { ... },
  "default_generation_settings": {
    "id": 0,
    "id_task": -1,
    "n_ctx": 1024,
    "speculative": false,
    "is_processing": false,
    "params": {
      "n_predict": -1,
      "seed": 4294967295,
      "temperature": 0.800000011920929,
      "dynatemp_range": 0.0,
      "dynatemp_exponent": 1.0,
      "top_k": 40,
      "top_p": 0.949999988079071,
      "min_p": 0.05000000074505806,
      "xtc_probability": 0.0,
      "xtc_threshold": 0.10000000149011612,
      "typical_p": 1.0,
      "repeat_last_n": 64,
      "repeat_penalty": 1.0,
      "presence_penalty": 0.0,
      "frequency_penalty": 0.0,
      "dry_multiplier": 0.0,
      "dry_base": 1.75,
      "dry_allowed_length": 2,
      "dry_penalty_last_n": -1,
      "dry_sequence_breakers": [
        "\n",
        ":",
        "\"",
        "*"
      ],
      "mirostat": 0,
      "mirostat_tau": 5.0,
      "mirostat_eta": 0.10000000149011612,
      "penalize_nl": false,
      "stop": [],
      "max_tokens": -1,
      "n_keep": 0,
      "n_discard": 0,
      "ignore_eos": false,
      "stream": true,
      "n_probs": 0,
      "min_keep": 0,
      "grammar": "",
      "samplers": [
        "dry",
        "top_k",
        "typ_p",
        "top_p",
        "min_p",
        "xtc",
        "temperature"
      ],
      "speculative.n_max": 16,
      "speculative.n_min": 5,
      "speculative.p_min": 0.8999999761581421,
      "timings_per_token": false
    },
    "prompt": "",
    "next_token": {
      "has_next_token": true,
      "has_new_line": false,
      "n_remain": -1,
      "n_decoded": 0,
      "stopping_word": ""
    }
  },
  "total_slots": 1,
  "chat_template": ""
  "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
  "chat_template": "..."
}
```

- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `model_path` - the path to the model file (same as the `-m` argument)
- `chat_template` - the model's original Jinja2 prompt template
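As a quick illustration (a sketch, not part of the reference docs), the fields above can be inspected with a plain `GET` request against the documented default address:

```javascript
// Sketch: fetch the read-only /props payload and print a few documented fields.
const props = await (await fetch("http://127.0.0.1:8080/props")).json();
console.log("total slots:", props.total_slots);
console.log("model path:", props.model_path);
console.log("default n_ctx per slot:", props.default_generation_settings.n_ctx);
```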

### POST `/props`: Change server global properties.
@@ -739,56 +810,74 @@ Example:

```json
[
  {
    "dynatemp_exponent": 1.0,
    "dynatemp_range": 0.0,
    "frequency_penalty": 0.0,
    "grammar": "",
    "id": 0,
    "ignore_eos": false,
    "is_processing": false,
    "logit_bias": [],
    "min_p": 0.05000000074505806,
    "mirostat": 0,
    "mirostat_eta": 0.10000000149011612,
    "mirostat_tau": 5.0,
    "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
    "n_ctx": 2048,
    "n_keep": 0,
    "n_predict": 100000,
    "n_probs": 0,
    "next_token": {
      "has_next_token": true,
      "n_remain": -1,
      "n_decoded": 0,
      "stopped_eos": false,
      "stopped_limit": false,
      "stopped_word": false,
      "stopping_word": ""
    },
    "penalize_nl": true,
    "presence_penalty": 0.0,
    "prompt": "Say hello to llama.cpp",
    "repeat_last_n": 64,
    "repeat_penalty": 1.100000023841858,
    "samplers": [
      "top_k",
      "typical_p",
      "top_p",
      "min_p",
      "temperature"
    ],
    "seed": 42,
    "stop": [
      "\n"
    ],
    "stream": false,
    "task_id": 0,
    "temperature": 0.0,
    "top_k": 40,
    "top_p": 0.949999988079071,
    "typical_p": 1.0
  },
  {
    "id": 0,
    "id_task": -1,
    "n_ctx": 1024,
    "speculative": false,
    "is_processing": false,
    "params": {
      "n_predict": -1,
      "seed": 4294967295,
      "temperature": 0.800000011920929,
      "dynatemp_range": 0.0,
      "dynatemp_exponent": 1.0,
      "top_k": 40,
      "top_p": 0.949999988079071,
      "min_p": 0.05000000074505806,
      "xtc_probability": 0.0,
      "xtc_threshold": 0.10000000149011612,
      "typical_p": 1.0,
      "repeat_last_n": 64,
      "repeat_penalty": 1.0,
      "presence_penalty": 0.0,
      "frequency_penalty": 0.0,
      "dry_multiplier": 0.0,
      "dry_base": 1.75,
      "dry_allowed_length": 2,
      "dry_penalty_last_n": -1,
      "dry_sequence_breakers": [
        "\n",
        ":",
        "\"",
        "*"
      ],
      "mirostat": 0,
      "mirostat_tau": 5.0,
      "mirostat_eta": 0.10000000149011612,
      "penalize_nl": false,
      "stop": [],
      "max_tokens": -1,
      "n_keep": 0,
      "n_discard": 0,
      "ignore_eos": false,
      "stream": true,
      "n_probs": 0,
      "min_keep": 0,
      "grammar": "",
      "samplers": [
        "dry",
        "top_k",
        "typ_p",
        "top_p",
        "min_p",
        "xtc",
        "temperature"
      ],
      "speculative.n_max": 16,
      "speculative.n_min": 5,
      "speculative.p_min": 0.8999999761581421,
      "timings_per_token": false
    },
    "prompt": "",
    "next_token": {
      "has_next_token": true,
      "has_new_line": false,
      "n_remain": -1,
      "n_decoded": 0,
      "stopping_word": ""
    }
  }
]
```

File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -1,4 +1,5 @@
|
||||
import pytest
|
||||
import requests
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
@@ -22,7 +23,12 @@ def test_server_props():
|
||||
server.start()
|
||||
res = server.make_request("GET", "/props")
|
||||
assert res.status_code == 200
|
||||
assert ".gguf" in res.body["model_path"]
|
||||
assert res.body["total_slots"] == server.n_slots
|
||||
default_val = res.body["default_generation_settings"]
|
||||
assert server.n_ctx is not None and server.n_slots is not None
|
||||
assert default_val["n_ctx"] == server.n_ctx / server.n_slots
|
||||
assert default_val["params"]["seed"] == server.seed
|
||||
|
||||
|
||||
def test_server_models():
|
||||
@@ -33,6 +39,31 @@ def test_server_models():
|
||||
assert len(res.body["data"]) == 1
|
||||
assert res.body["data"][0]["id"] == server.model_alias
|
||||
|
||||
|
||||
def test_server_slots():
|
||||
global server
|
||||
|
||||
# without slots endpoint enabled, this should return error
|
||||
server.server_slots = False
|
||||
server.start()
|
||||
res = server.make_request("GET", "/slots")
|
||||
assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED
|
||||
assert "error" in res.body
|
||||
server.stop()
|
||||
|
||||
# with slots endpoint enabled, this should return slots info
|
||||
server.server_slots = True
|
||||
server.n_slots = 2
|
||||
server.start()
|
||||
res = server.make_request("GET", "/slots")
|
||||
assert res.status_code == 200
|
||||
assert len(res.body) == server.n_slots
|
||||
assert server.n_ctx is not None and server.n_slots is not None
|
||||
assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots
|
||||
assert "params" in res.body[0]
|
||||
assert res.body[0]["params"]["seed"] == server.seed
|
||||
|
||||
|
||||
def test_load_split_model():
|
||||
global server
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
@@ -46,3 +77,20 @@ def test_load_split_model():
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(little|girl)+", res.body["content"])
|
||||
|
||||
|
||||
def test_no_webui():
|
||||
global server
|
||||
# default: webui enabled
|
||||
server.start()
|
||||
url = f"http://{server.server_host}:{server.server_port}"
|
||||
res = requests.get(url)
|
||||
assert res.status_code == 200
|
||||
assert "<html>" in res.text
|
||||
server.stop()
|
||||
|
||||
# with --no-webui
|
||||
server.no_webui = True
|
||||
server.start()
|
||||
res = requests.get(url)
|
||||
assert res.status_code == 404
|
||||
|
||||
@@ -30,6 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
|
||||
],
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "cmpl" in res.body["id"] # make sure the completion id has the expected format
|
||||
assert res.body["model"] == model if model is not None else server.model_alias
|
||||
assert res.body["usage"]["prompt_tokens"] == n_prompt
|
||||
assert res.body["usage"]["completion_tokens"] == n_predicted
|
||||
@@ -59,9 +60,13 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
|
||||
"stream": True,
|
||||
})
|
||||
content = ""
|
||||
last_cmpl_id = None
|
||||
for data in res:
|
||||
choice = data["choices"][0]
|
||||
assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
|
||||
if last_cmpl_id is None:
|
||||
last_cmpl_id = data["id"]
|
||||
assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream
|
||||
if choice["finish_reason"] in ["stop", "length"]:
|
||||
assert data["usage"]["prompt_tokens"] == n_prompt
|
||||
assert data["usage"]["completion_tokens"] == n_predicted
|
||||
|
||||
@@ -25,6 +25,7 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
|
||||
assert res.body["timings"]["prompt_n"] == n_prompt
|
||||
assert res.body["timings"]["predicted_n"] == n_predicted
|
||||
assert res.body["truncated"] == truncated
|
||||
assert type(res.body["has_new_line"]) == bool
|
||||
assert match_regex(re_content, res.body["content"])
|
||||
|
||||
|
||||
@@ -42,10 +43,17 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp
|
||||
})
|
||||
content = ""
|
||||
for data in res:
|
||||
assert "stop" in data and type(data["stop"]) == bool
|
||||
if data["stop"]:
|
||||
assert data["timings"]["prompt_n"] == n_prompt
|
||||
assert data["timings"]["predicted_n"] == n_predicted
|
||||
assert data["truncated"] == truncated
|
||||
assert data["stop_type"] == "limit"
|
||||
assert type(data["has_new_line"]) == bool
|
||||
assert "generation_settings" in data
|
||||
assert server.n_predict is not None
|
||||
assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict)
|
||||
assert data["generation_settings"]["seed"] == server.seed
|
||||
assert match_regex(re_content, content)
|
||||
else:
|
||||
content += data["content"]
|
||||
|
||||
@@ -13,28 +13,28 @@ def test_infill_without_input_extra():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"])
|
||||
assert match_regex("(Ann|small|shiny)+", res.body["content"])
|
||||
|
||||
|
||||
def test_infill_with_input_extra():
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_extra": [{
|
||||
"filename": "llama.h",
|
||||
"text": "LLAMA_API int32_t llama_n_threads();\n"
|
||||
}],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"])
|
||||
assert match_regex("(Dad|excited|park)+", res.body["content"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input_extra", [
|
||||
@@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"prompt": "Complete this",
|
||||
"input_extra": [input_extra],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_",
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 400
|
||||
assert "error" in res.body
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test")
|
||||
def test_with_qwen_model():
|
||||
global server
|
||||
server.model_file = None
|
||||
server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF"
|
||||
server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf"
|
||||
server.start(timeout_seconds=600)
|
||||
res = server.make_request("POST", "/infill", data={
|
||||
"input_extra": [{
|
||||
"filename": "llama.h",
|
||||
"text": "LLAMA_API int32_t llama_n_threads();\n"
|
||||
}],
|
||||
"input_prefix": "#include <cstdio>\n#include \"llama.h\"\n\nint main() {\n",
|
||||
"prompt": " int n_threads = llama_",
|
||||
"input_suffix": "}\n",
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert res.body["content"] == "n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n"
|
||||
|
||||
@@ -64,6 +64,7 @@ class ServerProcess:
|
||||
server_embeddings: bool | None = False
|
||||
server_reranking: bool | None = False
|
||||
server_metrics: bool | None = False
|
||||
server_slots: bool | None = False
|
||||
draft: int | None = None
|
||||
api_key: str | None = None
|
||||
response_format: str | None = None
|
||||
@@ -71,6 +72,7 @@ class ServerProcess:
|
||||
disable_ctx_shift: int | None = False
|
||||
draft_min: int | None = None
|
||||
draft_max: int | None = None
|
||||
no_webui: bool | None = None
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
@@ -91,7 +93,6 @@ class ServerProcess:
|
||||
else:
|
||||
server_path = "../../../build/bin/llama-server"
|
||||
server_args = [
|
||||
"--slots", # requires to get slot status via /slots endpoint
|
||||
"--host",
|
||||
self.server_host,
|
||||
"--port",
|
||||
@@ -129,6 +130,8 @@ class ServerProcess:
|
||||
server_args.append("--reranking")
|
||||
if self.server_metrics:
|
||||
server_args.append("--metrics")
|
||||
if self.server_slots:
|
||||
server_args.append("--slots")
|
||||
if self.model_alias:
|
||||
server_args.extend(["--alias", self.model_alias])
|
||||
if self.n_ctx:
|
||||
@@ -156,6 +159,8 @@ class ServerProcess:
|
||||
server_args.extend(["--draft-max", self.draft_max])
|
||||
if self.draft_min:
|
||||
server_args.extend(["--draft-min", self.draft_min])
|
||||
if self.no_webui:
|
||||
server_args.append("--no-webui")
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"bench: starting server with: {' '.join(args)}")
|
||||
@@ -181,7 +186,7 @@ class ServerProcess:
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < timeout_seconds:
|
||||
try:
|
||||
response = self.make_request("GET", "/slots", headers={
|
||||
response = self.make_request("GET", "/health", headers={
|
||||
"Authorization": f"Bearer {self.api_key}" if self.api_key else None
|
||||
})
|
||||
if response.status_code == 200:
|
||||
@@ -224,7 +229,7 @@ class ServerProcess:
|
||||
result.headers = dict(response.headers)
|
||||
result.status_code = response.status_code
|
||||
result.body = response.json() if parse_body else None
|
||||
print("Response from server", result.body)
|
||||
print("Response from server", json.dumps(result.body, indent=2))
|
||||
return result
|
||||
|
||||
def make_stream_request(
|
||||
@@ -245,7 +250,7 @@ class ServerProcess:
|
||||
break
|
||||
elif line.startswith('data: '):
|
||||
data = json.loads(line[6:])
|
||||
print("Partial response from server", data)
|
||||
print("Partial response from server", json.dumps(data, indent=2))
|
||||
yield data
|
||||
|
||||
|
||||
@@ -369,3 +374,6 @@ def match_regex(regex: str, text: str) -> bool:
|
||||
).search(text)
|
||||
is not None
|
||||
)
|
||||
|
||||
def is_slow_test_allowed():
|
||||
return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON"
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
@@ -164,6 +164,9 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
|
||||
} else {
|
||||
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
|
||||
}
|
||||
if (result.empty()) {
|
||||
throw std::runtime_error("\"prompt\" must not be empty");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -327,12 +330,12 @@ static std::string llama_get_chat_template(const struct llama_model * model) {
|
||||
std::string template_key = "tokenizer.chat_template";
|
||||
// call with NULL buffer to get the total size of the string
|
||||
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
|
||||
if (res < 0) {
|
||||
if (res < 2) {
|
||||
return "";
|
||||
} else {
|
||||
std::vector<char> model_template(res, 0);
|
||||
std::vector<char> model_template(res + 1, 0);
|
||||
llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
||||
return std::string(model_template.data(), model_template.size());
|
||||
return std::string(model_template.data(), model_template.size() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -496,8 +499,6 @@ static json oaicompat_completion_params_parse(
|
||||
const std::string & chat_template) {
|
||||
json llama_params;
|
||||
|
||||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
|
||||
|
||||
@@ -648,3 +649,18 @@ static json format_detokenized_response(const std::string & content) {
|
||||
{"content", content}
|
||||
};
|
||||
}
|
||||
|
||||
static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
|
||||
json data = json::array();
|
||||
for (const auto & lb : logit_bias) {
|
||||
data.push_back(json{
|
||||
{"bias", lb.bias},
|
||||
{"token", lb.token},
|
||||
});
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
static std::string safe_json_to_str(json data) {
|
||||
return data.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
<!-- sidebar -->
|
||||
<div class="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
|
||||
<label for="toggle-drawer" aria-label="close sidebar" class="drawer-overlay"></label>
|
||||
<div class="flex flex-col bg-base-200 min-h-full max-w-[calc(100vw-2em)] py-4 px-4">
|
||||
<div class="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
|
||||
<div class="flex flex-row items-center justify-between mb-4 mt-4">
|
||||
<h2 class="font-bold ml-4">Conversations</h2>
|
||||
|
||||
@@ -120,51 +120,25 @@
|
||||
{{ messages.length === 0 ? 'Send a message to start' : '' }}
|
||||
</div>
|
||||
<div v-for="msg in messages" class="group">
|
||||
<div :class="{
|
||||
'chat': true,
|
||||
'chat-start': msg.role !== 'user',
|
||||
'chat-end': msg.role === 'user',
|
||||
}">
|
||||
<div :class="{
|
||||
'chat-bubble markdown': true,
|
||||
'chat-bubble-base-300': msg.role !== 'user',
|
||||
}">
|
||||
<!-- textarea for editing message -->
|
||||
<template v-if="editingMsg && editingMsg.id === msg.id">
|
||||
<textarea
|
||||
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
|
||||
v-model="msg.content"></textarea>
|
||||
<br/>
|
||||
<button class="btn btn-ghost mt-2 mr-2" @click="editingMsg = null">Cancel</button>
|
||||
<button class="btn mt-2" @click="editUserMsgAndRegenerate(msg)">Submit</button>
|
||||
</template>
|
||||
<!-- render message as markdown -->
|
||||
<vue-markdown v-else :source="msg.content" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- actions for each message -->
|
||||
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
|
||||
<!-- user message -->
|
||||
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
|
||||
✍️ Edit
|
||||
</button>
|
||||
<!-- assistant message -->
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
||||
🔄 Regenerate
|
||||
</button>
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
|
||||
📋 Copy
|
||||
</button>
|
||||
</div>
|
||||
<message-bubble
|
||||
:config="config"
|
||||
:msg="msg"
|
||||
:key="msg.id"
|
||||
:is-generating="isGenerating"
|
||||
:edit-user-msg-and-regenerate="editUserMsgAndRegenerate"
|
||||
:regenerate-msg="regenerateMsg"></message-bubble>
|
||||
</div>
|
||||
|
||||
<!-- pending (ongoing) assistant message -->
|
||||
<div id="pending-msg" class="chat chat-start">
|
||||
<div v-if="pendingMsg" class="chat-bubble markdown chat-bubble-base-300">
|
||||
<span v-if="!pendingMsg.content" class="loading loading-dots loading-md"></span>
|
||||
<vue-markdown v-else :source="pendingMsg.content" />
|
||||
</div>
|
||||
<div id="pending-msg" class="group">
|
||||
<message-bubble
|
||||
v-if="pendingMsg"
|
||||
:config="config"
|
||||
:msg="pendingMsg"
|
||||
:key="pendingMsg.id"
|
||||
:is-generating="isGenerating"
|
||||
:edit-user-msg-and-regenerate="() => {}"
|
||||
:regenerate-msg="() => {}"></message-bubble>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -227,6 +201,10 @@
|
||||
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||
<summary class="collapse-title font-bold">Advanced config</summary>
|
||||
<div class="collapse-content">
|
||||
<div class="flex flex-row items-center mb-2">
|
||||
<input type="checkbox" class="checkbox" v-model="config.showTokensPerSecond" />
|
||||
<span class="ml-4">Show tokens per second</span>
|
||||
</div>
|
||||
<label class="form-control mb-2">
|
||||
<!-- Custom parameters input -->
|
||||
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
|
||||
@@ -247,6 +225,66 @@
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<!-- Template to be used as message bubble -->
|
||||
<template id="message-bubble">
|
||||
<div :class="{
|
||||
'chat': true,
|
||||
'chat-start': msg.role !== 'user',
|
||||
'chat-end': msg.role === 'user',
|
||||
}">
|
||||
<div :class="{
|
||||
'chat-bubble markdown': true,
|
||||
'chat-bubble-base-300': msg.role !== 'user',
|
||||
}">
|
||||
<!-- textarea for editing message -->
|
||||
<template v-if="editingContent !== null">
|
||||
<textarea
|
||||
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
|
||||
v-model="editingContent"></textarea>
|
||||
<br/>
|
||||
<button class="btn btn-ghost mt-2 mr-2" @click="editingContent = null">Cancel</button>
|
||||
<button class="btn mt-2" @click="editMsg()">Submit</button>
|
||||
</template>
|
||||
<template v-else>
|
||||
<!-- show loading dots for pending message -->
|
||||
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
|
||||
<!-- render message as markdown -->
|
||||
<vue-markdown v-else :source="msg.content"></vue-markdown>
|
||||
<!-- render timings if enabled -->
|
||||
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
|
||||
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>
|
||||
<div class="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
|
||||
<b>Prompt</b><br/>
|
||||
- Tokens: {{ timings.prompt_n }}<br/>
|
||||
- Time: {{ timings.prompt_ms }} ms<br/>
|
||||
- Speed: {{ timings.prompt_per_second.toFixed(1) }} t/s<br/>
|
||||
<b>Generation</b><br/>
|
||||
- Tokens: {{ timings.predicted_n }}<br/>
|
||||
- Time: {{ timings.predicted_ms }} ms<br/>
|
||||
- Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s<br/>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
</div>
|
||||
<!-- actions for each message -->
|
||||
<div :class="{'text-right': msg.role === 'user', 'opacity-0': isGenerating}" class="mx-4 mt-2 mb-2">
|
||||
<!-- user message -->
|
||||
<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingContent = msg.content" :disabled="isGenerating">
|
||||
✍️ Edit
|
||||
</button>
|
||||
<!-- assistant message -->
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
||||
🔄 Regenerate
|
||||
</button>
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg()" :disabled="isGenerating">
|
||||
📋 Copy
|
||||
</button>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
|
||||
<!-- Template to be used by settings modal -->
|
||||
<template id="settings-modal-short-input">
|
||||
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
|
||||
|
||||
7
examples/server/webui/package-lock.json
generated
7
examples/server/webui/package-lock.json
generated
@@ -13,6 +13,7 @@
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13"
},
@@ -2677,6 +2678,12 @@
"node": ">=14.0.0"
}
},
"node_modules/textlinestream": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",
"integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==",
"license": "MIT"
},
"node_modules/uc.micro": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz",

@@ -17,6 +17,7 @@
"markdown-it": "^14.1.0",
"postcss": "^8.4.49",
"tailwindcss": "^3.4.15",
"textlinestream": "^1.1.1",
"vite-plugin-singlefile": "^2.0.3",
"vue": "^3.5.13"
}

@@ -1,225 +0,0 @@
const paramDefaults = {
stream: true,
temperature: 0.2,
};

let generation_settings = null;

export class CompletionError extends Error {
constructor(message, name, data) {
super(message);
this.name = name;
}
};

// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url?.replace(/\/+$/, '') || "";

if (!controller) {
controller = new AbortController();
}

const completionParams = { ...paramDefaults, ...params, prompt };

const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
method: 'POST',
body: JSON.stringify(completionParams),
headers: {
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Accept': 'text/event-stream',
...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
},
signal: controller.signal,
});

const status = response.status;
if (status !== 200) {
try {
const body = await response.json();
if (body && body.error && body.error.message) {
throw new CompletionError(body.error.message, 'ServerError');
}
} catch (err) {
throw new CompletionError(err.message, 'ServerError');
}
}

const reader = response.body.getReader();
const decoder = new TextDecoder();

let content = "";
let leftover = ""; // Buffer for partially read lines

try {
let cont = true;

while (cont) {
const result = await reader.read();
if (result.done) {
break;
}

// Add any leftover data to the current chunk of data
const text = leftover + decoder.decode(result.value);

// Check if the last character is a line break
const endsWithLineBreak = text.endsWith('\n');

// Split the text into lines
let lines = text.split('\n');

// If the text doesn't end with a line break, then the last line is incomplete
// Store it in leftover to be added to the next chunk of data
if (!endsWithLineBreak) {
leftover = lines.pop();
} else {
leftover = ""; // Reset leftover if we have a line break at the end
}

// Parse all sse events and add them to result
const regex = /^(\S+):\s(.*)$/gm;
for (const line of lines) {
const match = regex.exec(line);
if (match) {
result[match[1]] = match[2];
if (result.data === '[DONE]') {
cont = false;
break;
}

// since we know this is llama.cpp, let's just decode the json in data
if (result.data) {
result.data = JSON.parse(result.data);
content += result.data.content;

// yield
yield result;

// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
cont = false;
break;
}
}
if (result.error) {
try {
result.error = JSON.parse(result.error);
if (result.error.message.includes('slot unavailable')) {
// Throw an error to be caught by upstream callers
throw new Error('slot unavailable');
} else {
console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
}
} catch(e) {
console.error(`llama.cpp error ${result.error}`)
}
}
}
}
}
} catch (e) {
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
controller.abort();
}

return content;
}

// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}

// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};

/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}

// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}
return generation_settings;
}
@@ -1,21 +1,25 @@
import './styles.css';
import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js';
import { llama } from './completion.js';
import MarkdownIt from 'markdown-it';
import TextLineStream from 'textlinestream';

const isDev = import.meta.env.MODE === 'development';

// utility functions
const isString = (x) => !!x.toLowerCase;
const isNumeric = (n) => !isString(n) && !isNaN(n);
const isBoolean = (x) => x === true || x === false;
const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n);
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
const copyStr = (str) => navigator.clipboard.writeText(str);

// constants
const BASE_URL = localStorage.getItem('base') // for debugging
|| (new URL('.', document.baseURI).href).toString(); // for production
|| (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production
const CONFIG_DEFAULT = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
// make sure these default values are in sync with `common.h`
samplers: 'dkypmxt',
temperature: 0.8,
@@ -101,6 +105,48 @@ const SettingsModalShortInput = defineComponent({
},
});

// message bubble component
const MessageBubble = defineComponent({
components: {
VueMarkdown
},
template: document.getElementById('message-bubble').innerHTML,
props: {
config: Object,
msg: Object,
isGenerating: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
data() {
return {
editingContent: null,
};
},
computed: {
timings() {
if (!this.msg.timings) return null;
return {
...this.msg.timings,
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
}
},
methods: {
copyMsg() {
copyStr(this.msg.content);
},
editMsg() {
this.editUserMsgAndRegenerate({
...this.msg,
content: this.editingContent,
});
this.editingContent = null;
},
},
});

// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
@@ -192,10 +238,29 @@ const chatScrollToBottom = (requiresNearBottom) => {
}
};

// wrapper for SSE
async function* sendSSEPostRequest(url, fetchOptions) {
const res = await fetch(url, fetchOptions);
const lines = res.body
.pipeThrough(new TextDecoderStream())
.pipeThrough(new TextLineStream());
for await (const line of lines) {
if (isDev) console.log({line});
if (line.startsWith('data:') && !line.endsWith('[DONE]')) {
const data = JSON.parse(line.slice(5));
yield data;
} else if (line.startsWith('error:')) {
const data = JSON.parse(line.slice(6));
throw new Error(data.message || 'Unknown error');
}
}
};

const mainApp = createApp({
components: {
VueMarkdown,
SettingsModalShortInput,
MessageBubble,
},
data() {
return {
@@ -209,7 +274,6 @@ const mainApp = createApp({
selectedTheme: StorageUtils.getTheme(),
config: StorageUtils.getConfig(),
showConfigDialog: false,
editingMsg: null,
// const
themes: THEMES,
configDefault: {...CONFIG_DEFAULT},
@@ -226,6 +290,15 @@ const mainApp = createApp({
});
resizeObserver.observe(pendingMsgElem);
},
watch: {
viewingConvId: function(val, oldVal) {
if (val != oldVal) {
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
}
}
},
methods: {
hideSidebar() {
document.getElementById('toggle-drawer').checked = false;
@@ -237,18 +310,10 @@ const mainApp = createApp({
newConversation() {
if (this.isGenerating) return;
this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
},
setViewingConv(convId) {
if (this.isGenerating) return;
this.viewingConvId = convId;
this.editingMsg = null;
this.fetchMessages();
chatScrollToBottom();
this.hideSidebar();
},
deleteConv(convId) {
if (this.isGenerating) return;
@@ -256,7 +321,6 @@ const mainApp = createApp({
StorageUtils.remove(convId);
if (this.viewingConvId === convId) {
this.viewingConvId = StorageUtils.getNewConvId();
this.editingMsg = null;
}
this.fetchConversation();
this.fetchMessages();
@@ -291,7 +355,6 @@ const mainApp = createApp({
this.fetchConversation();
this.fetchMessages();
this.inputMsg = '';
this.editingMsg = null;
this.generateMessage(currConvId);
chatScrollToBottom();
},
@@ -299,7 +362,6 @@ const mainApp = createApp({
if (this.isGenerating) return;
this.pendingMsg = { id: Date.now()+1, role: 'assistant', content: null };
this.isGenerating = true;
this.editingMsg = null;

try {
const abortController = new AbortController();
@@ -330,17 +392,21 @@ const mainApp = createApp({
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
};
const config = {
controller: abortController,
api_url: BASE_URL,
endpoint: '/chat/completions',
};
for await (const chunk of llama(prompt, params, config)) {
const stop = chunk.data.stop;
const addedContent = chunk.data.choices[0].delta.content;
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': this.config.apiKey ? `Bearer ${this.config.apiKey}` : undefined,
},
body: JSON.stringify(params),
signal: abortController.signal,
});
for await (const chunk of chunks) {
const stop = chunk.stop;
const addedContent = chunk.choices[0].delta.content;
const lastContent = this.pendingMsg.content || '';
if (addedContent) {
this.pendingMsg = {
@@ -349,6 +415,16 @@ const mainApp = createApp({
content: lastContent + addedContent,
};
}
const timings = chunk.timings;
if (timings && this.config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
prompt_ms: timings.prompt_ms,
predicted_n: timings.predicted_n,
predicted_ms: timings.predicted_ms,
};
}
}

StorageUtils.appendMsg(currConvId, this.pendingMsg);
@@ -387,14 +463,10 @@ const mainApp = createApp({
this.fetchMessages();
this.generateMessage(currConvId);
},
copyMsg(msg) {
copyStr(msg.content);
},
editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return;
const currConvId = this.viewingConvId;
const newContent = msg.content;
this.editingMsg = null;
StorageUtils.filterAndKeepMsgs(currConvId, (m) => m.id < msg.id);
StorageUtils.appendMsg(currConvId, {
id: Date.now(),

@@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
}

if (show_token_count) {
printf("Total number of tokens: %ld\n", tokens.size());
printf("Total number of tokens: %zu\n", tokens.size());
}
// silence valgrind
llama_free(ctx);

@@ -32,6 +32,13 @@ else()
endif()
endif()

# remove the lib prefix on win32 mingw
if (WIN32)
set(CMAKE_STATIC_LIBRARY_PREFIX "")
set(CMAKE_SHARED_LIBRARY_PREFIX "")
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()

option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

@@ -172,6 +179,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")

option(GGML_OPENCL "ggml: use OpenCL" OFF)
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)

# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

@@ -228,6 +228,7 @@
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
GGML_API void ggml_backend_load_all_from_path(const char * dir_path);

//
// Backend scheduler
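
The new ggml_backend_load_all_from_path() lets an application point the dynamic-backend loader at an explicit directory instead of only "./" and the executable's directory. A minimal sketch of how a host program might call it; the directory path is a made-up example, and the enumeration uses the ggml_backend_reg_* registry API from ggml-backend.h:

#include "ggml-backend.h"
#include <stdio.h>

int main(void) {
    // NULL keeps the default search paths; a non-NULL path is searched instead.
    // "/opt/ggml/backends" is a hypothetical install location.
    ggml_backend_load_all_from_path("/opt/ggml/backends");

    // Enumerate whatever backends ended up registered.
    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("registered backend: %s\n", ggml_backend_reg_name(reg));
    }
    return 0;
}
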
@@ -103,24 +103,14 @@ extern "C" {

// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);

struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};

GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
@@ -140,13 +130,6 @@ extern "C" {

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);

#ifdef __cplusplus
}
#endif

26
ggml/include/ggml-opencl.h
Normal file
@@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);

#ifdef __cplusplus
}
#endif

#endif // GGML_OPENCL_H
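
The new header exposes the OpenCL backend directly, alongside the registration entry point used by the backend registry. A hedged sketch of direct initialization; it assumes a build configured with GGML_OPENCL=ON plus a working OpenCL runtime, and uses the generic ggml_backend_name()/ggml_backend_free() helpers from ggml-backend.h:

#include "ggml-opencl.h"
#include <stdio.h>

int main(void) {
    ggml_backend_t backend = ggml_backend_opencl_init();
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize the OpenCL backend\n");
        return 1;
    }
    // sanity-check with the new predicate, then release the backend
    printf("backend: %s, is_opencl: %d\n",
           ggml_backend_name(backend), ggml_backend_is_opencl(backend) ? 1 : 0);
    ggml_backend_free(backend);
    return 0;
}
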
@@ -237,7 +237,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1

#define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_NEOX   2
#define GGML_ROPE_TYPE_MROPE  8
#define GGML_ROPE_TYPE_VISION 24

#define GGUF_MAGIC "GGUF"

@@ -384,15 +386,15 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// GGML_TYPE_Q4_0_4_8 = 32,
// GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT = 39,
};

// precision
@@ -433,9 +435,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};

// available tensor operations:
@@ -1446,6 +1445,22 @@ extern "C" {
float beta_fast,
float beta_slow);

GGML_API struct ggml_tensor * ggml_rope_multi(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);

// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
struct ggml_context * ctx,
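
ggml_rope_multi(), declared in the hunk above, extends rotary embedding with per-section dimension splits (M-RoPE). A sketch of a call under assumed shapes — the section sizes, frequency parameters, and the layout of the position tensor b are illustrative assumptions, not taken from this diff:

// Hypothetical: apply M-RoPE to a [n_dims, n_head, n_tokens] activation.
static struct ggml_tensor * apply_mrope(struct ggml_context * ctx,
                                        struct ggml_tensor  * cur,  // e.g. [128, 4, 6], F32
                                        struct ggml_tensor  * pos)  // I32 positions (assumed 4 per token)
{
    int sections[4] = {16, 24, 24, 0};  // per-section rotary dims, illustrative
    return ggml_rope_multi(ctx, cur, pos, NULL,
                           128,                   // n_dims
                           sections,
                           GGML_ROPE_TYPE_MROPE,  // mode
                           32768,                 // n_ctx_orig
                           1000000.0f, 1.0f,      // freq_base, freq_scale
                           0.0f, 1.0f,            // ext_factor, attn_factor
                           32.0f, 1.0f);          // beta_fast, beta_slow
}
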
@@ -2205,11 +2220,19 @@ extern "C" {
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#ifdef __cplusplus
// restrict not standard in C++
# if defined(__GNUC__)
# define GGML_RESTRICT __restrict__
# elif defined(__clang__)
# define GGML_RESTRICT __restrict
# elif defined(_MSC_VER)
# define GGML_RESTRICT __restrict
# else
# define GGML_RESTRICT
# endif
#else
#define GGML_RESTRICT restrict
# define GGML_RESTRICT restrict
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
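
One detail of the revised macro: Clang also defines __GNUC__, so the first branch is the one that normally fires for both GCC and Clang, expanding to __restrict__; MSVC falls through to __restrict. A small sketch of a conversion routine written against the ggml_to_float_t convention above — ggml_fp16_t and ggml_fp16_to_fp32() are the public F16 helpers from ggml.h:

static void f16_row_to_f32(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    const ggml_fp16_t * src = (const ggml_fp16_t *) x;
    for (int64_t i = 0; i < k; ++i) {
        y[i] = ggml_fp16_to_fp32(src[i]);  // GGML_RESTRICT promises x and y never alias
    }
}
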
@@ -194,11 +194,6 @@ endif()

if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

if (BUILD_SHARED_LIBS)
# TODO: should not use this
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
endif()

# ggml
@@ -220,9 +215,7 @@ add_library(ggml-base
ggml-threading.cpp
ggml-threading.h
ggml-quants.c
ggml-quants.h
ggml-aarch64.c
ggml-aarch64.h)
ggml-quants.h)

target_include_directories(ggml-base PRIVATE .)

@@ -315,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL)

foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

@@ -1,129 +0,0 @@
#define GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "ggml-aarch64.h"
#include "ggml-impl.h"
#include "ggml-quants.h"
#include <assert.h>

#define UNUSED GGML_UNUSED

static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;

for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}

const int end = QK4_0 * 2 / blck_size_interleave;

if (blck_size_interleave == 8) {
const uint64_t xor_mask = 0x8888888888888888ULL;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;

uint64_t elems;
// Using memcpy to avoid unaligned memory accesses
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
const uint32_t xor_mask = 0x88888888;
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;

uint32_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
}
} else {
GGML_ASSERT(false);
}

return out;
}

// interleave 8 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x8 out;

for (int i = 0; i < 8; i++) {
out.d[i] = in[i].d;
}

const int end = QK4_0 * 4 / blck_size_interleave;
const uint64_t xor_mask = 0x8888888888888888ULL;

for (int i = 0; i < end; ++i) {
int src_id = i % 8;
int src_offset = (i / 8) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;

uint64_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}

return out;
}

static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;

void * out_ptr = NULL;
if (nrows_interleaved == 8) {
out_ptr = (block_q4_0x8 *) dst;
}
else if (nrows_interleaved == 4) {
out_ptr = (block_q4_0x4 *) dst;
}
assert(nrows_interleaved <= 8);
block_q4_0 dst_tmp[8];

for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) {

for (int64_t x = 0; x < nb; x++) {

for (int i = 0; i < nrows_interleaved; i++ ) {
quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0);
}

if (nrows_interleaved == 8) {
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
}

return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}

size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}

size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}

size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}
@@ -1,19 +0,0 @@
#pragma once

#include "ggml.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#endif

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

#ifdef __cplusplus
}
#endif

@@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif

#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif

#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
@@ -449,11 +456,21 @@ static std::string backend_filename_suffix() {
#endif
}

static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths
std::vector<std::string> search_paths = { "./", get_executable_path() };
std::string file_prefix = backend_filename_prefix() + name + "-";
std::vector<std::string> search_paths;
if (user_search_path == nullptr) {
search_paths.push_back("./");
search_paths.push_back(get_executable_path());
} else {
#if defined(_WIN32)
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
}

int best_score = 0;
std::string best_path;
@@ -463,7 +480,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
if (!fs::exists(search_path)) {
continue;
}
for (const auto & entry : fs::directory_iterator(search_path)) {
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) {
if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string();
std::string ext = entry.path().extension().string();
@@ -509,21 +527,26 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent)
}

void ggml_backend_load_all() {
ggml_backend_load_all_from_path(nullptr);
}

void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG
bool silent = true;
#else
bool silent = false;
#endif

ggml_backend_load_best("blas", silent);
ggml_backend_load_best("cann", silent);
ggml_backend_load_best("cuda", silent);
ggml_backend_load_best("hip", silent);
ggml_backend_load_best("kompute", silent);
ggml_backend_load_best("metal", silent);
ggml_backend_load_best("rpc", silent);
ggml_backend_load_best("sycl", silent);
ggml_backend_load_best("vulkan", silent);
ggml_backend_load_best("musa", silent);
ggml_backend_load_best("cpu", silent);
ggml_backend_load_best("blas", silent, dir_path);
ggml_backend_load_best("cann", silent, dir_path);
ggml_backend_load_best("cuda", silent, dir_path);
ggml_backend_load_best("hip", silent, dir_path);
ggml_backend_load_best("kompute", silent, dir_path);
ggml_backend_load_best("metal", silent, dir_path);
ggml_backend_load_best("rpc", silent, dir_path);
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("cpu", silent, dir_path);
}

@@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (*ext_factor != 0) {
return false;
}

const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}

return true;
}
case GGML_OP_UPSCALE: {
@@ -2089,7 +2098,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cann_reg_get_device,
/* .get_device = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
};

@@ -6,7 +6,20 @@
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

// std c++ allows anonymous unions, but some compilers warn on them
#define GGML_COMMON_AGGR_U data
// std c++ does not allow anonymous structs
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#endif
@@ -154,9 +171,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -175,9 +192,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
@@ -196,37 +213,13 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 ds;
};
} GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

typedef struct {
ggml_half d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");

typedef struct {
ggml_half d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");

typedef struct {
ggml_half d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");

typedef struct {
ggml_half d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");

//
// Ternary quantization
//
@@ -261,9 +254,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

@@ -288,9 +281,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
@@ -305,9 +298,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
@@ -418,12 +411,6 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");

typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

@@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
@@ -479,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END()

//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,

@@ -10,10 +10,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list (APPEND GGML_CPU_SOURCES
ggml-cpu/ggml-cpu.c
ggml-cpu/ggml-cpu.cpp
ggml-cpu/ggml-cpu-aarch64.c
ggml-cpu/ggml-cpu-aarch64.cpp
ggml-cpu/ggml-cpu-aarch64.h
ggml-cpu/ggml-cpu-hbm.cpp
ggml-cpu/ggml-cpu-hbm.h
ggml-cpu/ggml-cpu-quants.c
ggml-cpu/ggml-cpu-quants.h
ggml-cpu/ggml-cpu-traits.cpp
ggml-cpu/ggml-cpu-traits.h
ggml-cpu/amx/amx.cpp
ggml-cpu/amx/amx.h
ggml-cpu/amx/mmq.cpp

@@ -5,6 +5,7 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-traits.h"

#if defined(__gnu_linux__)
#include <sys/syscall.h>
@@ -17,31 +18,65 @@

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

// AMX tensor traits
namespace ggml::cpu::amx {
class tensor_traits : public ggml::cpu::tensor_traits {
bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
size = ggml_backend_amx_desired_wsize(op);
return true;
}

bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT) {
ggml_backend_amx_mul_mat(params, op);
return true;
}
return false;
}
};

static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) {
static tensor_traits traits;
return &traits;
}
} // namespace ggml::cpu::amx

// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}

static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
return (void *) (buffer->context);
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
uint8_t value, size_t offset, size_t size) {
memset((char *) tensor->data + offset, value, size);

GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
memcpy((char *) tensor->data + offset, data, size);
}

GGML_UNUSED(buffer);
}

/*
// need to figure what we need to do with buffer->extra.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
@@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con

GGML_UNUSED(buffer);
}
*/

static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
@@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .init_tensor = */ ggml_backend_amx_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .get_tensor = */ nullptr,
/* .cpy_tensor = */ nullptr,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
/* .reset = */ nullptr,
};

static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
@@ -86,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
void * data = ggml_aligned_malloc(size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
@@ -101,18 +137,48 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ
GGML_UNUSED(buft);
}

static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
namespace ggml::cpu::amx {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};

if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
is_contiguous_2d(op->src[1]) && // src1 must be contiguous
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
// src1 must be host buffer
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
// src1 must be float32
if (op->src[1]->type == GGML_TYPE_F32) {
return true;
}
}
return false;
}

ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer &&
op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}

return nullptr;
}
};
} // namespace ggml::cpu::amx

static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
return ggml_backend_amx_get_alloc_size(tensor);

GGML_UNUSED(buft);
}

static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;

GGML_UNUSED(buft);
}

#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
@@ -129,68 +195,26 @@ static bool ggml_amx_init() {
return true;
#endif
}

ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ nullptr,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
/* .context = */ new ggml::cpu::amx::extra_buffer_type(),
};

if (!ggml_amx_init()) {
return NULL;
return nullptr;
}

return &ggml_backend_buffer_type_amx;
}

bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}

bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};

switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;

case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];

const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];

// amx kernels enabled for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);

bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is 32x

return can_use_amx;
}
default:
return false;
}
}

#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)

|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
// GGML internal header
|
||||
|
||||
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
||||
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
|
||||
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
|
||||
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
|
||||
#if defined(_OPENMP)
|
||||
#if defined(GGML_USE_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
@@ -56,11 +56,11 @@ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
|
||||
}
|
||||
|
||||
template <typename func_t>
|
||||
inline void parallel_for(int nth, int n, const func_t& f) {
|
||||
#if defined(_OPENMP)
|
||||
#pragma omp parallel num_threads(nth)
|
||||
inline void parallel_for(int n, const func_t& f) {
|
||||
#if defined(GGML_USE_OPENMP)
|
||||
#pragma omp parallel
|
||||
{
|
||||
//int nth = omp_get_num_threads();
|
||||
int nth = omp_get_num_threads();
|
||||
int ith = omp_get_thread_num();
|
||||
int tbegin, tend;
|
||||
balance211(n, nth, ith, tbegin, tend);
|
||||
@@ -68,8 +68,6 @@ inline void parallel_for(int nth, int n, const func_t& f) {
|
||||
}
|
||||
#else
|
||||
f(0, n);
|
||||
|
||||
GGML_UNUSED(nth);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -91,10 +89,3 @@ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
|
||||
(type == GGML_TYPE_Q6_K) ||
|
||||
(type == GGML_TYPE_IQ4_XS);
|
||||
}
|
||||
|
||||
// ggml backend context
|
||||
struct ggml_backend_amx_context {
|
||||
int n_threads = GGML_DEFAULT_N_THREADS;
|
||||
std::unique_ptr<char[]> work_data;
|
||||
size_t work_size = 0;
|
||||
};
|
@@ -18,10 +18,6 @@
#include <unistd.h>
#endif

#if defined(_OPENMP)
#include <omp.h>
#endif

#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
@@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size

template<typename TB, int BLOCK_K>
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
const int NB = N / TILE_N;
const int KB = K / BLOCK_K;
const int TILE_SIZE = get_tile_size<TB>();

// parallel on NB should be enough
parallel_for(n_threads, NB, [&](int begin, int end) {
parallel_for(NB, [&](int begin, int end) {
for (int n = begin; n < end; ++n) {
for (int k = 0; k < KB; ++k) {
int n0 = n * TILE_N;
@@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
const int K = tensor->ne[0]; // ne0: in_features
const int N = tensor->ne[1]; // ne1: out_features

#if defined(_OPENMP)
// the buffer ctx is not initialized when .set_tensor is called
int n_threads = omp_get_num_threads();
#else
int n_threads = 1;
#endif

GGML_DISPATCH_QTYPES(TYPE, [&] {
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
});
}

@@ -1,16 +1,10 @@
|
||||
#pragma once
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
|
||||
|
||||
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
|
||||
|
||||
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
|
||||
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,20 +1,57 @@
|
||||
#define GGML_COMMON_IMPL_C
|
||||
#define GGML_COMMON_IMPL_CPP
|
||||
#define GGML_COMMON_DECL_CPP
|
||||
#include "ggml-common.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu/ggml-cpu-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include <stdlib.h> // for qsort
|
||||
#include <stdio.h> // for GGML_ASSERT
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
#include <cfloat>
|
||||
#include <cstdlib> // for qsort
|
||||
#include <cstdio> // for GGML_ASSERT
|
||||
|
||||
#include "ggml-cpu-aarch64.h"
|
||||
|
||||
// TODO: move to include file?
|
||||
template <int K> constexpr int QK_0() {
|
||||
if constexpr (K == 4) {
|
||||
return QK4_0;
|
||||
}
|
||||
if constexpr (K == 8) {
|
||||
return QK8_0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
template <int K, int N> struct block {
|
||||
ggml_half d[N]; // deltas for N qK_0 blocks
|
||||
int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
|
||||
};
|
||||
|
||||
// control size
|
||||
static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
|
||||
static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
|
||||
static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
|
||||
static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
|
||||
|
||||
using block_q4_0x4 = block<4, 4>;
|
||||
using block_q4_0x8 = block<4, 8>;
|
||||
using block_q8_0x4 = block<8, 4>;
|
||||
using block_q8_0x8 = block<8, 8>;
|
||||
|
||||
struct block_iq4_nlx4 {
|
||||
ggml_half d[4]; // deltas for 4 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
||||
|
||||
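// Editor's sketch (not part of the patch): the size arithmetic behind the
// static_asserts above, written out with plain integers. It relies only on
// QK4_0 == QK8_0 == 32 and ggml_half being 2 bytes.
#include <cstdio>

int main() {
    const int QK = 32; // QK4_0 == QK8_0 == 32
    const int Ks[] = {4, 8};
    const int Ns[] = {4, 8};
    // block<K, N>: N 2-byte deltas + (QK * N * K) / 8 bytes of quants
    for (int K : Ks) {
        for (int N : Ns) {
            const int d_bytes  = N * 2;
            const int qs_bytes = (QK * N * K) / 8;
            std::printf("block<%d,%d>: %d + %d = %d bytes\n",
                        K, N, d_bytes, qs_bytes, d_bytes + qs_bytes);
        }
    }
    return 0; // e.g. block<4,4> -> 8 + 64 = 72, matching 4*sizeof(ggml_half) + QK8_0*2
}
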
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
#elif defined(_MSC_VER)
@@ -185,12 +222,12 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)

static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;

block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

#if defined(__ARM_NEON)
float32x4_t srcv[4][8];
@@ -279,12 +316,12 @@ static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int6
#endif
}

static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) {
static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;

block_q8_0x4 * restrict y = (block_q8_0x4 *) vy;
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

#if defined(__ARM_NEON)
float32x4_t srcv[4][8];
@@ -494,7 +531,7 @@ static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int6
#endif
}

void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
assert(nrow == 4);
UNUSED(nrow);
if (blck_size_interleave == 4) {
@@ -506,7 +543,7 @@ void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nro
}
}

void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@@ -591,7 +628,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@@ -701,7 +738,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 8;
@@ -974,7 +1011,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@@ -1070,7 +1107,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
}
}

void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@@ -1586,7 +1623,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@@ -2040,7 +2077,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 8;
@@ -2560,31 +2597,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)

// Shuffle pattern one - right side input
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)

const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)

const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)

const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)

// Shuffle pattern two - right side input

const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)

const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)

const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)

const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)

// Scale values - Load the weight scale values of two block_q4_0x8
const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
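
// Editor's sketch (not part of the patch): why the (_MM_PERM_ENUM) casts were
// added throughout this hunk. GCC/Clang declare the control operand of
// _mm512_shuffle_epi32 as _MM_PERM_ENUM, so a bare integer literal can fail
// to compile as C++, while MSVC accepts a plain int. 136 == 0b10001000
// replicates elements 0 and 2 of each 128-bit lane. Requires AVX-512F
// (compile with e.g. -mavx512f).
#include <immintrin.h>

__m512i shuffle_2020(__m512i v) {
    return _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)136); // same constant as in the diff
}
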
@@ -2618,31 +2655,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *

// Shuffle pattern one - left side input

const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)

const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)

const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)

const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)

// Shuffle pattern two - left side input

const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)

const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)

const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)

const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)

// The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e. corresponding bytes are multiplied and added into 32 bit integers within 32 bit lane
// Resembles MMLAs into 2x2 matrices in ARM Version
@@ -2671,10 +2708,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *


// Straighten out to make 4 row vectors
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78))
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);

// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
@@ -2753,31 +2790,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)

// Shuffle pattern one - right side input
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)

const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)

const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)

const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)

// Shuffle pattern two - right side input

const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)

const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)

const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)

const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)


// Scale values - Load the weight scale values of two block_q4_0x8
@@ -2809,31 +2846,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *

// Shuffle pattern one - left side input

const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)

const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)

const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)

const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)

// Shuffle pattern two - left side input

const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)

const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)

const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)

const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)

// The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e. corresponding bytes are multiplied and added into 32 bit integers within 32 bit lane
// Resembles MMLAs into 2x2 matrices in ARM Version
@@ -2862,10 +2899,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *


// Straighten out to make 4 row vectors
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
__m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78));
__m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01);
__m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78));
__m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11);

// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
@@ -3460,7 +3497,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
@@ -3571,7 +3608,6 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void
}
}

// FIXME: this code is duplicated from ggml-aarch64.c
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;

@@ -3641,20 +3677,20 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
return out;
}

static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
constexpr int nrows_interleaved = 4;

block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
const block_q4_0 * src = (const block_q4_0 *)data;
block_q4_0 dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 4;
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK4_0;

GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));

if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}

@@ -3672,20 +3708,20 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
GGML_UNUSED(data_size);
}

static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) {
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(interleave_block == 8);
constexpr int nrows_interleaved = 8;

block_q4_0x8 * dst = (block_q4_0x8*)t->data;
const block_q4_0 * src = (const block_q4_0*) data;
block_q4_0 dst_tmp[8];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 8;
int nrow = ggml_nrows(t);
int nblocks = t->ne[0] / QK4_0;

GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));

if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}

@@ -3712,16 +3748,18 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s

const int end = QK4_NL * 2 / blck_size_interleave;

if (blck_size_interleave == 8) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;
// TODO: this branch seems wrong
//if (blck_size_interleave == 8) {
// for (int i = 0; i < end; ++i) {
// int src_id = i % 4;
// int src_offset = (i / 4) * blck_size_interleave;
// int dst_offset = i * blck_size_interleave;

// Using memcpy to avoid unaligned memory accesses
memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
}
} else if (blck_size_interleave == 4) {
// // Using memcpy to avoid unaligned memory accesses
// memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
// }
//} else
if (blck_size_interleave == 4) {
for (int i = 0; i < end; ++i) {
int src_id = i % 4;
int src_offset = (i / 4) * blck_size_interleave;
@@ -3736,20 +3774,21 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
return out;
}

static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
//GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
GGML_ASSERT(interleave_block == 4);

block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
const block_iq4_nl * src = (const block_iq4_nl *)data;
block_iq4_nl dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrow = ggml_nrows(t);
int nrows_interleaved = 4;
int nblocks = t->ne[0] / QK4_0;

GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
return -1;
}

@@ -3767,57 +3806,457 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
GGML_UNUSED(data_size);
}

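// Editor's sketch (not part of the patch): the shape guard shared by the
// repack_* functions above. Interleaving packs nrows_interleaved consecutive
// rows into one tiled block, so ne[1] (rows per matrix) must divide evenly;
// ggml_nrows() would also count the batch dims ne[2]*ne[3], which is why the
// check moved from nrow to ne[1]. Dimensions below are made up for the demo.
#include <cstdint>

static bool can_repack(int64_t ne0, int64_t ne1, int nrows_interleaved) {
    return ne1 % nrows_interleaved == 0 && ne0 % 8 == 0;
}

int main() {
    // 4096x4096 weight, 4-row interleave: accepted; 4095 rows: rejected
    return (can_repack(4096, 4096, 4) && !can_repack(4096, 4095, 4)) ? 0 : 1;
}
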
// Prepare for optimized kernels if applicable
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
if (cur->type == repack_type) {
memcpy(cur->data, data, data_size);
return;
}
namespace ggml::cpu::aarch64 {
// repack
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
int repack(struct ggml_tensor *, const void *, size_t);

if (cur->type == GGML_TYPE_Q4_0) {
switch (repack_type) {
case GGML_TYPE_Q4_0_8_8:
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_8:
repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_4:
repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
} else if (cur->type == GGML_TYPE_IQ4_NL) {
switch (repack_type) {
case GGML_TYPE_IQ4_NL_4_4:
repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
} else {
GGML_ABORT("Unsupported type");
}
// TODO: generalise.
template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
}

enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
}

template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
}

template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}

// TODO: needs to be revisited
//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
//}

// gemv
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
void gemv(int, float *, size_t, const void *, const void *, int, int);

template <> void gemv<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <>
void gemv<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

// gemm
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
void gemm(int, float *, size_t, const void *, const void *, int, int);

template <> void gemm<block_q4_0, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 8>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <>
void gemm<block_iq4_nl, 4, 4>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

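// Editor's sketch (not part of the patch): how the specializations above fit
// together. A single <BLOC_TYPE, INTER_SIZE, NB_COLS> triple statically binds
// a matching repack/gemv/gemm set, replacing the old runtime switch over
// GGML_TYPE_Q4_0_4_4 / _4_8 / _8_8. `demo_gemm_call` is a hypothetical
// helper name; the template parameters and gemm<> come from the diff above.
template <typename B, int64_t I, int64_t N>
static void demo_gemm_call(int n, float * s, size_t bs,
                           const void * vx, const void * vy, int nr, int nc) {
    gemm<B, I, N>(n, s, bs, vx, vy, nr, nc); // e.g. <block_q4_0, 8, 8> resolves to ggml_gemm_q4_0_8x8_q8_0
}
// usage: demo_gemm_call<block_q4_0, 8, 8>(n, s, bs, vx, vy, nr, nc);
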
class tensor_traits_base : public ggml::cpu::tensor_traits {
public:
virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
};

template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_traits : public tensor_traits_base {

bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
// not really a GGML_TYPE_Q8_0 but same size.
switch (op->op) {
case GGML_OP_MUL_MAT:
size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
return true;
case GGML_OP_MUL_MAT_ID:
size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1]));
size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
return true;
default:
// GGML_ABORT("fatal error");
break;
}
return false;
}

bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
|
||||
switch (op->op) {
|
||||
case GGML_OP_MUL_MAT:
|
||||
forward_mul_mat(params, op);
|
||||
return true;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
forward_mul_mat_id(params, op);
|
||||
return true;
|
||||
default:
|
||||
// GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
|
||||
const ggml_tensor * src0 = op->src[0];
|
||||
const ggml_tensor * src1 = op->src[1];
|
||||
ggml_tensor * dst = op;
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
|
||||
// GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
|
||||
|
||||
char * wdata = static_cast<char *>(params->wdata);
|
||||
const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);
|
||||
|
||||
assert(params->wsize >= nbw1 * ne11);
|
||||
|
||||
const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;
|
||||
|
||||
int64_t i11_processed = 0;
|
||||
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
||||
quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10,
|
||||
INTER_SIZE);
|
||||
}
|
||||
i11_processed = ne11 - ne11 % 4;
|
||||
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
||||
from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
const void * src1_wdata = params->wdata;
|
||||
const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10);
|
||||
int64_t src0_start = (ith * ne01) / nth;
|
||||
int64_t src0_end = ((ith + 1) * ne01) / nth;
|
||||
src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
|
||||
src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
|
||||
if (src0_start >= src0_end) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
|
||||
if (ne11 > 3) {
|
||||
gemm<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data) + src0_start, ne01,
|
||||
(const char *) src0->data + src0_start * nb01,
|
||||
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
|
||||
}
|
||||
for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
|
||||
gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
|
||||
(const char *) src0->data + src0_start * nb01,
|
||||
(const char *) src1_wdata + (src1_col_stride * iter), 1,
|
||||
src0_end - src0_start);
|
||||
}
|
||||
}
|
||||
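    // Standalone sketch (hypothetical values, kept under #if 0 so it stays out
    // of any build): the NB_COLS alignment applied to src0_start/src0_end
    // above. round_up() mirrors the ternary expressions in forward_mul_mat().
#if 0
#include <cstdint>
#include <cstdio>

static int64_t round_up(int64_t x, int64_t nb) {
    return (x % nb) ? x + nb - (x % nb) : x;
}

int main() {
    const int64_t ne01    = 96; // src0 rows, assumed to be a multiple of NB_COLS
    const int64_t nb_cols = 8;  // NB_COLS of the repacked layout
    const int     nth     = 5;  // worker threads
    for (int ith = 0; ith < nth; ith++) {
        const int64_t start = round_up(( ith      * ne01) / nth, nb_cols);
        const int64_t end   = round_up(((ith + 1) * ne01) / nth, nb_cols);
        if (start >= end) {
            continue; // no aligned rows left for this thread
        }
        printf("thread %d: rows [%2lld, %2lld)\n", ith, (long long) start, (long long) end);
    }
    return 0;
}
#endif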

    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        const ggml_tensor * ids = op->src[2];
        ggml_tensor * dst = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float;

        // we don't support permuted src0 or src1
        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
        GGML_ASSERT(nb10 == ggml_type_size(src1->type));

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(ne03 == 1);
        GGML_ASSERT(ne13 == 1);
        GGML_ASSERT(ne3 == 1);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        // row groups
        const int n_ids = ids->ne[0]; // n_expert_used
        const int n_as = ne02;        // n_expert

        const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10);
        const size_t nbw2 = nbw1*ne11;
        const size_t nbw3 = nbw2*ne12;

        struct mmid_row_mapping {
            int32_t i1;
            int32_t i2;
        };

        GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
                                      n_as * ne12 * sizeof(mmid_row_mapping)));

        auto wdata = (char *) params->wdata;
        auto wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
        int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]

        // src1: float32 => block_q8_0
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
                from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
                           (void *) (wdata + i12 * nbw2 + i11 * nbw1),
                           ne10);
            }
        }

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]

        if (ith == 0) {
            // initialize matrix_row_counts
            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));

            // group rows by src0 matrix
            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
                for (int32_t id = 0; id < n_ids; ++id) {
                    const int32_t i02 =
                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);

                    GGML_ASSERT(i02 >= 0 && i02 < n_as);

                    MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
                    matrix_row_counts[i02] += 1;
                }
            }
        }

        ggml_barrier(params->threadpool);

        // compute each matrix multiplication in sequence
        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
            const int64_t cne1 = matrix_row_counts[cur_a];

            if (cne1 == 0) {
                continue;
            }

            auto src0_cur = (const char *) src0->data + cur_a*nb02;

            //const int64_t nr0 = ne01; // src0 rows
            const int64_t nr1 = cne1; // src1 rows

            int64_t src0_cur_start = (ith * ne01) / nth;
            int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
            src0_cur_start =
                (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
            src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;

            if (src0_cur_start >= src0_cur_end) return;

            for (int ir1 = 0; ir1 < nr1; ir1++) {
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
                const int id = row_mapping.i1; // selected expert index

                const int64_t i11 = id % ne11;
                const int64_t i12 = row_mapping.i2; // row index in src1

                const int64_t i1 = id;  // selected expert index
                const int64_t i2 = i12; // row

                auto src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);

                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS>(
                    ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start,
                    ne01, src0_cur + src0_cur_start * nb01,
                    src1_col, 1, src0_cur_end - src0_cur_start);
            }
        }
#undef MMID_MATRIX_ROW
    }
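    // Standalone sketch (hypothetical routing data, kept under #if 0 so it
    // stays out of any build): the row-grouping pass above. Each
    // (token, expert-slot) pair is bucketed under its expert so the per-expert
    // GEMVs can walk contiguous row lists.
#if 0
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    struct mmid_row_mapping { int32_t i1; int32_t i2; };

    const int n_tokens = 4; // ne12 (hypothetical)
    const int n_ids    = 2; // experts used per token
    const int n_as     = 3; // total experts
    const int32_t ids[4][2] = { {0, 2}, {1, 2}, {0, 1}, {2, 1} };

    int64_t          matrix_row_counts[3];
    mmid_row_mapping matrix_rows[3 * 4]; // [n_as][n_tokens]
    memset(matrix_row_counts, 0, sizeof(matrix_row_counts));

    // same bucketing as the ith == 0 pass above
    for (int32_t iid1 = 0; iid1 < n_tokens; ++iid1) {
        for (int32_t id = 0; id < n_ids; ++id) {
            const int32_t i02 = ids[iid1][id]; // expert chosen for this slot
            matrix_rows[i02 * n_tokens + matrix_row_counts[i02]] = { id, iid1 };
            matrix_row_counts[i02] += 1;
        }
    }

    for (int e = 0; e < n_as; e++) {
        printf("expert %d: %lld row(s)\n", e, (long long) matrix_row_counts[e]);
    }
    return 0;
}
#endif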

    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                       (int) NB_COLS, (int) INTER_SIZE);
        return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
    }
};

// instance for Q4
static const tensor_traits<block_q4_0, 4, 4> q4_0_4x4_q8_0;
static const tensor_traits<block_q4_0, 8, 4> q4_0_4x8_q8_0;
static const tensor_traits<block_q4_0, 8, 8> q4_0_8x8_q8_0;

// instance for IQ4
static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;

} // namespace ggml::cpu::aarch64

static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
    if (cur->type == GGML_TYPE_Q4_0) {
-       // TODO: enable for AVX2 - currently disabled due to bad gemv performance
-       if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
-           return GGML_TYPE_Q4_0_8_8;
+       if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+           if (cur->ne[1] % 8 == 0) {
+               return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
+           }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-           return GGML_TYPE_Q4_0_4_8;
+           if (cur->ne[1] % 4 == 0) {
+               return &ggml::cpu::aarch64::q4_0_4x8_q8_0;
+           }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-           return GGML_TYPE_Q4_0_4_4;
+           if (cur->ne[1] % 4 == 0) {
+               return &ggml::cpu::aarch64::q4_0_4x4_q8_0;
+           }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-           return GGML_TYPE_IQ4_NL_4_4;
+           if (cur->ne[1] % 4 == 0) {
+               return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0;
+           }
        }
    }

-   return cur->type;
+   return nullptr;
}

static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));

    GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                                       const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra;
    auto OK = tensor_traits->repack(tensor, data, size);

    GGML_ASSERT(OK == 0);
    GGML_UNUSED(buffer);
}

static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU_AARCH64";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);

    if (buffer == nullptr) {
        return nullptr;
    }

    buffer->buft = buft;
    buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
    buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
    return buffer;
}

static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

namespace ggml::cpu::aarch64 {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
        if (    op->op == GGML_OP_MUL_MAT &&
                op->src[0]->buffer &&
                (ggml_n_dims(op->src[0]) == 2) &&
                op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() &&
                ggml_aarch64_get_optimal_repack_type(op->src[0])
                ) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
            //    return true;
            //}
            // may be possible if Q8_0 packed...
        } else if (op->op == GGML_OP_MUL_MAT_ID
                && op->src[0]->buffer
                && (ggml_n_dims(op->src[0]) == 3)
                && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()
                && ggml_aarch64_get_optimal_repack_type(op->src[0])
                ) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
            //    return true;
            //}
        }
        return false;
    }

    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) {
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
            }
        }
        return nullptr;
    }
};
} // namespace ggml::cpu::aarch64

ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment,
            /* .get_max_size   = */ nullptr, // defaults to SIZE_MAX
            /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
            /* .is_host        = */ nullptr,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
    };

    return &ggml_backend_cpu_buffer_type_aarch64;
}

@@ -1,32 +1,8 @@
#pragma once

+#include "ggml-cpu-traits.h"
#include "ggml.h"

// GGML internal header

-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Quantization
-void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
-
-// GEMV
-void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-// GEMM
-void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
-enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
-
-#ifdef __cplusplus
-}
-#endif
-
+ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);

ggml/src/ggml-cpu/ggml-cpu-hbm.cpp (new file, 55 lines)
@@ -0,0 +1,55 @@
#ifdef GGML_USE_CPU_HBM

#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"

#include "ggml-cpu-hbm.h"

// buffer type HBM

#include <hbwmalloc.h>

static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU_HBM";

    GGML_UNUSED(buft);
}

static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    hbw_free(buffer->context);
}

static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                                                           size_t size) {
    void * ptr;
    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
    if (result != 0) {
        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
        return NULL;
    }

    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft = buft;
    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;

    return buffer;
}

ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_cpu_hbm_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size   = */ nullptr, // defaults to SIZE_MAX
            /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .context = */ nullptr,
    };

    return &ggml_backend_cpu_buffer_type_hbm;
}
#endif
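// Hedged usage sketch (assumes a build with GGML_USE_CPU_HBM and libmemkind):
// allocating a buffer through the HBM buffer type defined above. Only
// functions that appear in this patch are used; the size is arbitrary.
//
//     ggml_backend_buffer_type_t buft = ggml_backend_cpu_hbm_buffer_type();
//     ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 16u * 1024 * 1024);
//     // ... place tensors in buf; hbw_free() runs later via iface.free_buffer ...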
ggml/src/ggml-cpu/ggml-cpu-hbm.h (new file, 8 lines)
@@ -0,0 +1,8 @@
#pragma once

#include "ggml-backend.h"
#include "ggml.h"

// GGML CPU internal header

ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);

ggml/src/ggml-cpu/ggml-cpu-traits.cpp (new file, 36 lines)
@@ -0,0 +1,36 @@
#include "ggml-cpu-traits.h"

#include "ggml-backend-impl.h"
#include "ggml-backend.h"

namespace ggml::cpu {
tensor_traits::~tensor_traits() {}

extra_buffer_type::~extra_buffer_type() {}
} // namespace ggml::cpu

bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra->context) {
            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
            auto tensor_traits = buf_extra->get_tensor_traits(op);
            if (tensor_traits && tensor_traits->compute_forward(params, op)) {
                return true;
            }
        }
    }
    return false;
}

bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
        if (extra && extra->context) {
            auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
            auto tensor_traits = buf_extra->get_tensor_traits(op);
            if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) {
                return true;
            }
        }
    }
    return false;
}

ggml/src/ggml-cpu/ggml-cpu-traits.h (new file, 38 lines)
@@ -0,0 +1,38 @@
#pragma once
#include "ggml-backend-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml.h"

#ifdef __cplusplus
#    include <vector>
extern "C" {
#endif

// return true if the op is handled by an extra "accelerator" buffer type
bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);

#ifdef __cplusplus
}

namespace ggml::cpu {
// register in tensor->extra
class tensor_traits {
  public:
    virtual ~tensor_traits();
    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
};

class extra_buffer_type {
  public:
    virtual ~extra_buffer_type();
    virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
};
} // namespace ggml::cpu

// implemented in ggml-cpu.cpp.
std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();

#endif
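// Hedged sketch, for illustration only: the minimal shape of a third-party
// implementation of the two interfaces above. Everything under my_ext is
// hypothetical; returning false from both hooks simply falls through the
// dispatch loops in ggml-cpu-traits.cpp, leaving the op to the default CPU path.
namespace my_ext {
class noop_traits : public ggml::cpu::tensor_traits {
    bool work_size(int /*n_threads*/, const struct ggml_tensor * /*op*/, size_t & size) override {
        size = 0;     // requests no extra scratch
        return false; // let the regular CPU path size the op
    }
    bool compute_forward(struct ggml_compute_params * /*params*/, struct ggml_tensor * /*op*/) override {
        return false; // never claims an op
    }
};
} // namespace my_ext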
(File diff suppressed because it is too large)

@@ -2,12 +2,18 @@
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
+#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
#include "amx/amx.h"

#include <cctype>
#include <string>
#include <vector>

+#ifdef GGML_USE_CPU_HBM
+#include "ggml-cpu-hbm.h"
+#endif
+
#if defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
@@ -23,115 +29,7 @@

// ggml-backend interface

-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include <hbwmalloc.h>
-
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_HBM";
-
-    GGML_UNUSED(buft);
-}
-
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    hbw_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr;
-    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
-    if (result != 0) {
-        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface = */ {
-            /* .get_name       = */ ggml_backend_cpu_hbm_buffer_type_get_name,
-            /* .alloc_buffer   = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-            /* .get_alignment  = */ ggml_backend_cpu_buffer_type_get_alignment,
-            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .is_host        = */ ggml_backend_cpu_buffer_type_is_host,
-        },
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-// buffer type AARCH64
-
-static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
-    tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset == 0);
-    GGML_ASSERT(size == ggml_nbytes(tensor));
-
-    enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
-
-    ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
-
-    GGML_UNUSED(buffer);
-}
-
-static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
-    return "CPU_AARCH64";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-
-    if (buffer == NULL) {
-        return NULL;
-    }
-
-    buffer->buft = buft;
-    buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
-    buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
-        /* .iface = */ {
-            /* .get_name       = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
-            /* .alloc_buffer   = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
-            /* .get_alignment  = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-            /* .is_host        = */ NULL,
-        },
-        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_cpu_buffer_type_aarch64;
-}
-
-bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
-    return buft == ggml_backend_cpu_aarch64_buffer_type();
-}
-
-static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type() {
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

@@ -152,11 +50,22 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
        return bufts;
    }();

-    return bufts.data();
+    return bufts;
}

+static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+    return ggml_backend_cpu_get_extra_buffers_type().data();
+
+    GGML_UNUSED(device);
+}
+
+static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) return true;
+    }
+    return false;
+}
+
// CPU backend - backend (stream)

struct ggml_backend_cpu_context {
@@ -465,25 +374,19 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
        return true;
    }

-    if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
-        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
-            return false;
+    // extra_buffer_op?
+    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra) {
+            auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
+            if (buf_extra && buf_extra->supports_op(dev, op)) {
+                return true;
+            }
        }
    }

-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-    if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
-        return ggml_backend_amx_device_supports_op(op);
-    }
-    for (int i = 1; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
-            return false;
-        }
-    }
-#endif
-
-    for (int i = 1; i < GGML_MAX_SRC; i++) {
-        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
+    // the other cases need a host buffer.
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) {
            return false;
        }
    }
@@ -506,19 +409,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
        default:
            return true;
    }

    GGML_UNUSED(dev);
}

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);

-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
-    supported = supported || ggml_backend_amx_buft_is_amx(buft);
-#endif
-
-    return supported;
-
+    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft);
    GGML_UNUSED(dev);
}

@@ -666,10 +560,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r

static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
-        return (void *)ggml_backend_cpu_set_n_threads;
+        ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads;
+        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
-        return (void *)ggml_backend_cpu_get_extra_bufts;
+        ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type;
+        return (void *)fct;
    }
    if (strcmp(name, "ggml_backend_get_features") == 0) {
        return (void *)ggml_backend_cpu_get_features;

@@ -41,28 +41,28 @@
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define CC_PASCAL 600
-#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA 700
-#define CC_TURING 750
-#define CC_AMPERE 800
-#define CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_PASCAL 600
+#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA 700
+#define GGML_CUDA_CC_TURING 750
+#define GGML_CUDA_CC_AMPERE 800
+#define GGML_CUDA_CC_OFFSET_AMD 1000000

// GCN/CDNA, wave size is 64
-#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
-#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
-#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
-#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
-#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
-#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
+#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
+#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
-#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

-#define CC_QY1 210
-#define CC_QY2 220
+#define GGML_CUDA_CC_QY1 210
+#define GGML_CUDA_CC_QY2 220

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
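// Standalone sketch, for illustration only: how the compute-capability codes
// above are formed and compared. The constants are copied from this header;
// the device numbers are hypothetical.
#include <cstdio>

constexpr int GGML_CUDA_CC_VOLTA      = 700;
constexpr int GGML_CUDA_CC_OFFSET_AMD = 1000000;

constexpr bool is_fp16_mma(int cc) { // same test as fp16_mma_available()
    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}

int main() {
    const int cc_nvidia = 100*8 + 10*6;                           // e.g. an sm_86 device
    const int cc_amd    = 100*9 + 10*0 + GGML_CUDA_CC_OFFSET_AMD; // e.g. a gfx900 device
    printf("sm_86  cc=%7d fp16_mma=%d\n", cc_nvidia, is_fp16_mma(cc_nvidia));
    printf("gfx900 cc=%7d fp16_mma=%d\n", cc_amd,    is_fp16_mma(cc_amd));
    return 0;
}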

@@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16

-#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
-#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#define FP16_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define INT8_MMA_AVAILABLE
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
#define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

static constexpr bool fast_fp16_available(const int cc) {
-    return cc >= CC_PASCAL && cc != 610;
+    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
}

static constexpr bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}

static constexpr bool int8_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
+    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
}

[[noreturn]]
@@ -187,7 +187,7 @@ static __device__ void no_device_code(
#endif // __CUDA_ARCH__

static __device__ __forceinline__ int warp_reduce_sum(int x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
@@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
}

static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
@@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
}

#if CUDART_VERSION < CUDART_HMASK
@@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i

#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

-#if __CUDA_ARCH__ >= MIN_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
    return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}

@@ -94,7 +94,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
}

// non-contiguous kernel (slow)
-static __global__ void concat_f32_non_cont(
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
+    concat_f32_non_cont(
        const char * src0,
        const char * src1,
        char * dst,
@@ -121,22 +123,28 @@ static __global__ void concat_f32_non_cont(
        uint64_t nb0,
        uint64_t nb1,
        uint64_t nb2,
-        uint64_t nb3,
-        int32_t dim) {
+        uint64_t nb3){
+    static_assert(dim >= 0 && dim <= 3);

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;

-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));

    const float * x;

-    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
            x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
        } else {
-            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+            if constexpr (dim == 0) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+            } else if constexpr (dim == 1) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+            } else if constexpr (dim == 2) {
+                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+            } else if constexpr (dim == 3) {
+                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+            }
        }

        float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -182,15 +190,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        }
    } else {
        dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
-        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
-            (const char *)src0->data,
-            (const char *)src1->data,
-            (      char *)dst->data,
+        auto launch_kernel = [&](auto dim) {
+            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
-            dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-            dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+                dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+        };
+        switch (dim) {
+            case 0:
+                launch_kernel(std::integral_constant<int, 0>{});
+                break;
+            case 1:
+                launch_kernel(std::integral_constant<int, 1>{});
+                break;
+            case 2:
+                launch_kernel(std::integral_constant<int, 2>{});
+                break;
+            case 3:
+                launch_kernel(std::integral_constant<int, 3>{});
+                break;
+            default:
+                GGML_ABORT("Invalid dim: %d", dim);
+                break;
+        }
    }
}
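// Standalone sketch, for illustration only: the runtime-to-compile-time dim
// dispatch used by ggml_cuda_op_concat() above, reduced to plain C++ so the
// std::integral_constant trick is visible on its own.
#include <cstdio>
#include <type_traits>

template <int dim> static void kernel() { printf("instantiated for dim=%d\n", dim); }

static void launch(int dim) {
    auto launch_kernel = [&](auto d) { kernel<decltype(d)::value>(); };
    switch (dim) {
        case 0: launch_kernel(std::integral_constant<int, 0>{}); break;
        case 1: launch_kernel(std::integral_constant<int, 1>{}); break;
        case 2: launch_kernel(std::integral_constant<int, 2>{}); break;
        case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
    }
}

int main() { launch(2); } // prints "instantiated for dim=2"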
@@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
||||
|
||||
template <bool need_check>
|
||||
static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
|
||||
#if __CUDA_ARCH__ >= CC_PASCAL
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
|
||||
|
||||
const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
|
||||
@@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
||||
GGML_UNUSED(y);
|
||||
GGML_UNUSED(k);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
|
||||
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
|
||||
return dequantize_block_q8_0_f16_cuda;
|
||||
}
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
|
||||
@@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
|
||||
|
||||
// On AMD the tile kernels perform poorly, use the vec kernel instead:
|
||||
if (cc >= CC_OFFSET_AMD) {
|
||||
if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
|
||||
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
} else {
|
||||
|
||||
@@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
@@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
|
||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||
|
||||
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
||||
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
||||
if (src0->type != GGML_TYPE_F16) {
|
||||
@@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
}
|
||||
|
||||
@@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
|
||||
cudaDataType_t cu_data_type = CUDA_R_16F;
|
||||
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
|
||||
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
|
||||
cu_compute_type = CUBLAS_COMPUTE_32F;
|
||||
}
|
||||
|
||||
@@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
|
||||
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||
@@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return true;
|
||||
}
|
||||
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
||||
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||
return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||
}
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
@@ -3210,7 +3210,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
|
||||
static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_cuda_reg_get_device_count,
|
||||
/* .get_device_get = */ ggml_backend_cuda_reg_get_device,
|
||||
/* .get_device = */ ggml_backend_cuda_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address,
|
||||
};
|
||||
|
||||
|
||||
@@ -171,7 +171,7 @@ struct mma_int_C_I16J8 {
|
||||
|
||||
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
|
||||
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
|
||||
@@ -183,7 +183,7 @@ struct mma_int_C_I16J8 {
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
|
||||
#endif // __CUDA_ARCH__ >= CC_AMPERE
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#else
|
||||
GGML_UNUSED(mma_A);
|
||||
GGML_UNUSED(mma_B);
|
||||
@@ -193,7 +193,7 @@ struct mma_int_C_I16J8 {
|
||||
|
||||
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
|
||||
#ifdef INT8_MMA_AVAILABLE
|
||||
#if __CUDA_ARCH__ >= CC_AMPERE
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
|
||||
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
|
||||
@@ -211,7 +211,7 @@ struct mma_int_C_I16J8 {
|
||||
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
|
||||
: "+r"(x[2]), "+r"(x[3])
|
||||
: "r"(mma_A.x[3]), "r"(mma_B.x[1]));
|
||||
#endif // __CUDA_ARCH__ >= CC_AMPERE
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||
#else
|
||||
GGML_UNUSED(mma_A);
|
||||
GGML_UNUSED(mma_B);
|
||||
|
||||
@@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
|
||||
|
||||
switch (src0->type) {
|
||||
@@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (cc < MIN_CC_DP4A) {
|
||||
if (cc < GGML_CUDA_CC_DP4A) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return true;
|
||||
#endif //GGML_CUDA_FORCE_MMQ
|
||||
|
||||
if (cc < CC_OFFSET_AMD) {
|
||||
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
if (cc < GGML_CUDA_CC_OFFSET_AMD) {
|
||||
return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
@@ -89,9 +89,9 @@ struct tile_x_sizes {
|
||||
static constexpr int get_mmq_x_max_host(const int cc) {
|
||||
return int8_mma_available(cc) ? 128 :
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
|
||||
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
|
||||
#else
|
||||
cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
|
||||
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
|
||||
#endif // GGML_CUDA_FORCE_MMQ
|
||||
}
|
||||
|
||||
@@ -104,23 +104,23 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
return 128;
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
return MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
#else // GGML_CUDA_FORCE_MMQ
|
||||
return 128;
|
||||
#endif // GGML_CUDA_FORCE_MMQ
|
||||
#else // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
return 64;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static constexpr int get_mmq_y_host(const int cc) {
|
||||
return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64);
|
||||
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
@@ -131,11 +131,11 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||
return 128;
|
||||
#endif // defined RDNA1
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
return 128;
|
||||
#else
|
||||
return 64;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
@@ -2574,11 +2574,11 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN)
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 1)
|
||||
#else
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
@@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
{
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
@@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
|
||||
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
||||
return;
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
|
||||
const int64_t blocks_per_ne00 = ne00 / qk;
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
@@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||
const int mmq_y = get_mmq_y_host(cc);
|
||||
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
|
||||
const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
|
||||
|
||||
int mmq_x_best = 0;
|
||||
int nparts_best = INT_MAX;
|
||||
|
||||
@@ -57,7 +57,7 @@ static __global__ void mul_mat_vec(
|
||||
if (block_size > WARP_SIZE) {
|
||||
buf_iw[tid/WARP_SIZE] = sumf;
|
||||
__syncthreads();
|
||||
if (tid > WARP_SIZE) {
|
||||
if (tid >= WARP_SIZE) {
|
||||
return;
|
||||
}
|
||||
sumf = buf_iw[tid];
|
||||
|
||||
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
|
||||
int64_t nwarps = 1;
|
||||
int64_t rows_per_cuda_block = 1;
|
||||
|
||||
if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
|
||||
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
|
||||
switch(ncols_y) {
|
||||
case 1:
|
||||
nwarps = 4;
|
||||
|
||||
@@ -4,6 +4,11 @@ struct rope_corr_dims {
    float v[2];
};


struct mrope_sections {
    int v[4];
};

static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - min(1.0f, max(0.0f, y));
@@ -108,6 +113,105 @@ static __global__ void rope_neox(
    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}

template<typename T, bool has_ff>
static __global__ void rope_multi(
    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (i0 >= ne0) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;

    if (i0 >= n_dims) {
        const int i = row*ne0 + i0;

        dst[i + 0] = x[i + 0];
        dst[i + 1] = x[i + 1];

        return;
    }

    const int i  = row*ne0 + i0/2;
    const int i2 = row/p_delta_rows;

    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
    int sec_w = sections.v[1] + sections.v[0];
    int sector = (i0 / 2) % sect_dims;

    float theta_base = 0.0;
    if (sector < sections.v[0]) {
        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
    }
    else if (sector >= sections.v[0] && sector < sec_w) {
        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
    }
    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
    }
    else if (sector >= sec_w + sections.v[2]) {
        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
    }

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

    float cos_theta;
    float sin_theta;

    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims/2];

    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
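For reference, the sector logic above can be read as follows: the rotated pair index i0/2 is folded modulo the total section width, and whichever of the four section ranges it lands in selects one of four interleaved position streams (pos[i2 + ne2*k]). A host-side C++ sketch of just that lookup (the array type and function name are illustrative, not from the kernel):

#include <array>

// Returns which of the four position streams supplies the angle for a
// given rotated pair index, mirroring the sector tests in rope_multi.
int mrope_pos_stream(int half_idx, const std::array<int, 4> & sections) {
    const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
    const int sector    = half_idx % sect_dims;

    if (sector < sections[0])                             return 0;
    if (sector < sections[0] + sections[1])               return 1;
    if (sector < sections[0] + sections[1] + sections[2]) return 2;
    return 3;
}
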

template<typename T, bool has_ff>
static __global__ void rope_vision(
    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (i0 >= ne0) {
        return;
    }

    const int row = blockDim.x*blockIdx.x + threadIdx.x;

    const int i  = row*ne0 + i0/2;
    const int i2 = row/p_delta_rows; // i2-th tokens

    int sect_dims = sections.v[0] + sections.v[1];
    int sec_w = sections.v[1] + sections.v[0];
    int sector = (i0 / 2) % sect_dims;

    float theta_base = 0.0;
    if (sector < sections.v[0]) {
        const int p = sector;
        theta_base = pos[i2]*powf(theta_scale, p);
    }
    else if (sector >= sections.v[0] && sector < sec_w) {
        const int p = sector - sections.v[0];
        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
    }

    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

    float cos_theta;
    float sin_theta;

    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + n_dims];

    dst[i + 0]      = x0*cos_theta - x1*sin_theta;
    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
}

template<typename T>
static void rope_norm_cuda(
    const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
@@ -156,6 +260,56 @@ static void rope_neox_cuda(
        }
    }
}

template<typename T>
static void rope_multi_cuda(
    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
    GGML_ASSERT(ne0 % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nr, n_blocks_x, 1);

    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    if (freq_factors == nullptr) {
        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, freq_factors, sections
        );
    } else {
        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, freq_factors, sections
        );
    }
}

template<typename T>
static void rope_vision_cuda(
    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
    GGML_ASSERT(ne0 % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nr, n_blocks_x, 1);
    // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
    // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);

    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    if (freq_factors == nullptr) {
        rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, freq_factors, sections
        );
    } else {
        rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
            theta_scale, freq_factors, sections
        );
    }
}
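The launch-geometry comment above works out as follows: each thread rotates a pair of elements, so one block in y covers 2*CUDA_ROPE_BLOCK_SIZE pair slots, while the grid x dimension enumerates all heads * seq rows. A sketch of that arithmetic in plain C++ (CUDA_ROPE_BLOCK_SIZE = 256 is an assumed value matching the rest of this backend; this is not the backend's code):

struct GridDims { int x, y, z; };

// Mirrors block_nums = (nr, n_blocks_x, 1) from the launchers above.
// For an assumed head_dim of 128, n_blocks_x is (128 + 511) / 512 == 1.
GridDims rope_grid(int ne0 /* head_dim */, int nr /* heads * seq */) {
    const int CUDA_ROPE_BLOCK_SIZE = 256;
    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    return {nr, n_blocks_x, 1};
}
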

static void rope_norm_cuda_f16(
    const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
@@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
    rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}

static void rope_multi_cuda_f16(
    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {

    rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}

static void rope_multi_cuda_f32(
    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {

    rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}

static void rope_vision_cuda_f16(
    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {

    rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}

static void rope_vision_cuda_f32(
    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {

    rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}

void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
@@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne00 = src0->ne[0]; // head dims
    const int64_t ne01 = src0->ne[1]; // num heads
    const int64_t ne02 = src0->ne[2]; // num heads
    const int64_t nr = ggml_nrows(src0);

    //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const int mode = ((int32_t *) dst->op_params)[2];
    //const int n_ctx = ((int32_t *) dst->op_params)[3];
    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
    mrope_sections sections;

    // RoPE alteration for extended context
    float freq_base;
@@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
    memcpy(&sections.v,  (int32_t *) dst->op_params + 11, sizeof(int)*4);

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

    if (is_mrope) {
        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
    }

    if (is_vision) {
        GGML_ASSERT(n_dims == ne00/2);
    }

    const int32_t * pos = (const int32_t *) src1_d;

@@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        } else {
            GGML_ABORT("fatal error");
        }
    } else if (is_mrope && !is_vision) {
        if (src0->type == GGML_TYPE_F32) {
            rope_multi_cuda_f32(
                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, freq_factors, sections, stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_multi_cuda_f16(
                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, freq_factors, sections, stream
            );
        } else {
            GGML_ABORT("fatal error");
        }
    } else if (is_vision) {
        if (src0->type == GGML_TYPE_F32) {
            rope_vision_cuda_f32(
                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, freq_factors, sections, stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_vision_cuda_f16(
                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, freq_factors, sections, stream
            );
        } else {
            GGML_ABORT("fatal error");
        }
    } else {
        if (src0->type == GGML_TYPE_F32) {
            rope_norm_cuda_f32(

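One detail worth noting in the dispatch above: is_neox and is_mrope are bit tests, while is_vision is an exact equality, so the is_mrope && !is_vision branch relies on vision mode also carrying the mrope bit. A small C++ model of that classification (the numeric values are assumptions based on ggml.h around this revision and are not shown in this diff):

enum {
    ROPE_TYPE_NEOX   = 2,   // assumed value
    ROPE_TYPE_MROPE  = 8,   // assumed value
    ROPE_TYPE_VISION = 24,  // assumed: mrope bit plus a vision bit
};

struct RopeKind { bool neox, mrope, vision; };

// Mirrors the three checks in ggml_cuda_op_rope: two bit tests, one
// exact match. classify(ROPE_TYPE_VISION) reports both mrope and vision.
RopeKind classify(int mode) {
    return {
        (mode & ROPE_TYPE_NEOX)  != 0,
        (mode & ROPE_TYPE_MROPE) != 0,
        mode == ROPE_TYPE_VISION,
    };
}
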
@@ -3,8 +3,6 @@
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700

#ifdef USE_CUB
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
// For this reason CUB must be included BEFORE anything else.
#include <cub/cub.cuh>
using namespace cub;
#endif // USE_CUB

@@ -74,8 +74,8 @@ static inline int ggml_up(int n, int m) {
//

GGML_ATTRIBUTE_FORMAT(2, 3)
void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
GGML_API void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);

#define GGML_LOG(...)      ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -304,8 +304,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

// Memory allocation

void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size);
GGML_API void * ggml_aligned_malloc(size_t size);
GGML_API void ggml_aligned_free(void * ptr, size_t size);

// FP16 to FP32 conversion


@@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
        case GGML_OP_SOFT_MAX:
        case GGML_OP_RMS_NORM:
        case GGML_OP_NORM:
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_ROPE:
            {
                const int mode = ((const int32_t *) op->op_params)[2];
                if (mode & GGML_ROPE_TYPE_MROPE) {
                    return false;
                }
                if (mode & GGML_ROPE_TYPE_VISION) {
                    return false;
                }
                return true;
            }
        case GGML_OP_DUP:
        case GGML_OP_CPY:
        case GGML_OP_CONT:

@@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
        case GGML_OP_ARGMAX:
        case GGML_OP_NORM:
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_ROPE:
            {
                const int mode = ((const int32_t *) op->op_params)[2];
                if (mode & GGML_ROPE_TYPE_MROPE) {
                    return false;
                }
                if (mode & GGML_ROPE_TYPE_VISION) {
                    return false;
                }
                return true;
            }
        case GGML_OP_IM2COL:
            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_1D:
@@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
        } break;
        case GGML_OP_ROPE:
            {
                GGML_ASSERT(ne10 == ne02);
                // make sure we have one or more position id(ne10) per token(ne02)
                GGML_ASSERT(ne10 % ne02 == 0);
                GGML_ASSERT(ne10 >= ne02);

                const int nth = MIN(1024, ne00);

147
ggml/src/ggml-opencl/CMakeLists.txt
Normal file
@@ -0,0 +1,147 @@
find_package(OpenCL REQUIRED)
find_package(Python3 REQUIRED)

set(TARGET_NAME ggml-opencl)

ggml_add_backend_library(${TARGET_NAME}
                         ggml-opencl.cpp
                         ../../include/ggml-opencl.h)
target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})

if (GGML_OPENCL_PROFILING)
    message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
    add_compile_definitions(GGML_OPENCL_PROFILING)
endif ()

add_compile_definitions(GGML_OPENCL_SOA_Q)

if (GGML_OPENCL_USE_ADRENO_KERNELS)
    message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
    add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
endif ()

if (GGML_OPENCL_EMBED_KERNELS)
    add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)

    set(OPENCL_CL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
    set(OPENCL_MM_CL_SOURCE_EMBED  "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
    set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")

    set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED         "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
    set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
    set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED      "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
    set(OPENCL_TRANSPOSE_16_SOURCE_EMBED           "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
    set(OPENCL_TRANSPOSE_32_SOURCE_EMBED           "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
    set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED        "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")

    set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
    file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")

    include_directories("${CMAKE_BINARY_DIR}/autogenerated")

    # Python must be accessible from command line
    add_custom_command(
        OUTPUT ${OPENCL_CL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
                ${OPENCL_CL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
                ${OPENCL_MM_CL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_mm.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
                ${OPENCL_CVT_CL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_cvt.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
                ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
                ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
                ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
|
||||
    )

    add_custom_command(
        OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
                ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_transpose_16.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
                ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_transpose_32.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
                ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
    )

    target_sources(${TARGET_NAME} PRIVATE
        ${OPENCL_CL_SOURCE_EMBED}
        ${OPENCL_MM_CL_SOURCE_EMBED}
        ${OPENCL_CVT_CL_SOURCE_EMBED}
        ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
        ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
        ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
        ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
        ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
        ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
else ()
    # copy ggml-opencl.cl to bin directory
    configure_file(kernels/ggml-opencl.cl     ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl     COPYONLY)
    configure_file(kernels/ggml-opencl_mm.cl  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl  COPYONLY)
    configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)

    configure_file(kernels/ggml-opencl_gemv_noshuffle.cl         ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl         COPYONLY)
    configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
    configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl      ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl      COPYONLY)
    configure_file(kernels/ggml-opencl_transpose_16.cl           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl           COPYONLY)
    configure_file(kernels/ggml-opencl_transpose_32.cl           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl           COPYONLY)
    configure_file(kernels/ggml-opencl_transpose_32_16.cl        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl        COPYONLY)
endif ()
4004
ggml/src/ggml-opencl/ggml-opencl.cpp
Normal file
File diff suppressed because it is too large
26
ggml/src/ggml-opencl/kernels/embed_kernel.py
Normal file
@@ -0,0 +1,26 @@
#

import sys
import logging
logger = logging.getLogger("opencl-embed-kernel")


def main():
    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) != 3:
        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
        sys.exit(1)

    ifile = open(sys.argv[1], "r")
    ofile = open(sys.argv[2], "w")

    for i in ifile:
        ofile.write('R"({})"\n'.format(i))

    ifile.close()
    ofile.close()


if __name__ == "__main__":
    main()
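The script turns every kernel source line into a C++ raw string literal, and adjacent literals concatenate back into one program string. A sketch of the intended consumption on the C++ side (an assumption based on the GGML_OPENCL_EMBED_KERNELS path above, not a quote from ggml-opencl.cpp):

// The generated header expands to a sequence of R"(...)" literals, one
// per source line; string-literal concatenation fuses them into a single
// kernel source that can be handed to clCreateProgramWithSource.
static const char * kernel_src =
#include "ggml-opencl.cl.h"
;
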
2683
ggml/src/ggml-opencl/kernels/ggml-opencl.cl
Normal file
File diff suppressed because it is too large
106
ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
Normal file
@@ -0,0 +1,106 @@
//------------------------------------------------------------------------------
// This file contains additional kernels for data conversion.
// These kernels are used when loading the model, so their performance is less
// important.
//------------------------------------------------------------------------------
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supported by OpenCL implementation on your device."
#endif

#ifdef cl_khr_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#elif defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#error "Subgroup not supported on your device."
#endif

#ifdef cl_intel_required_subgroup_size
// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
// Always use subgroup size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
// TODO: do not know how to choose subgroup size on other GPUs.
#error "Selecting subgroup size is not supported on your device."
#endif

#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2

typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};

//------------------------------------------------------------------------------
// mul_vec_q_n_f32_flat_noshuffle
//
// This variation uses flat arrays (struct of arrays, SOA) representation for
// quant tensors. It also uses non shuffled bit order for weights.
//
// The shuffled version is kept in the original file because moving it here
// seems to result in worse performance for adreno.
//------------------------------------------------------------------------------

kernel void kernel_convert_block_q4_0_noshuffle(
    global struct block_q4_0 * src0,
    global uchar * dst_q,
    global half * dst_d
) {
    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
    global half  * d = (global half  *) dst_d + get_global_id(0);

    *d = b->d;
    for (int i = 0; i < QK4_0/4; ++i) {
        uchar x0 = b->qs[2*i + 0];
        uchar x1 = b->qs[2*i + 1];

        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);

#ifdef ADRENO_GPU
        // Workaround for adreno - must have the following printf statement for
        // the kernel to work properly. Otherwise it produces incorrect result.
        // convert_uchar above also seems necessary.
        // Compare against a large number so that it does not print anything.
        // get_sub_group_local_id() also works.
        if (get_global_id(0) == 65536*4096) {
            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
#endif
    }
}
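Outside the Adreno workaround, the loop above is a plain nibble regrouping: each q4_0 block stores 32 4-bit weights as 16 bytes of (low, high) nibble pairs, and the kernel rewrites them so the first 8 output bytes pack the low nibbles of byte pairs and the next 8 pack the high nibbles. A host-side C++ model of the same transform (a sketch for checking the bit twiddling, not the backend's code):

#include <array>
#include <cstdint>

// Repacks one q4_0 block's 16 quant bytes into the "noshuffle" layout
// produced by kernel_convert_block_q4_0_noshuffle above.
std::array<uint8_t, 16> repack_q4_0_noshuffle(const std::array<uint8_t, 16> & qs) {
    std::array<uint8_t, 16> out{};
    for (int i = 0; i < 8; ++i) {                 // QK4_0/4 == 8 iterations
        const uint8_t x0 = qs[2*i + 0];
        const uint8_t x1 = qs[2*i + 1];
        out[i + 0] = uint8_t(( x0 & 0x0F)       | ((x1 & 0x0F) << 4)); // low nibbles
        out[i + 8] = uint8_t(((x0 & 0xF0) >> 4) |  (x1 & 0xF0));       // high nibbles
    }
    return out;
}
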
265
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
Normal file
@@ -0,0 +1,265 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

// assume
#define QK4_0 32
#define N_SIMDGROUP 4

#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
    float shared_y; \
    shared_y = sub_group_broadcast(y.s0, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 0); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 0); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 0); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 0); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 0); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 0); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 0); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 1); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 1); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 1); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 1); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 1); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 1); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 1); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y.s0, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 2); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 2); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 2); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 2); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 2); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 2); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 2); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 3); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 3); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 3); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 3); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 3); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 3); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 3); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
    float8 shared_y; \
    shared_y = sub_group_broadcast(y, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
        __read_only image1d_buffer_t src0_q,  // quantized A
        global half2 * src0_d,                // A scales
        __read_only image1d_buffer_t src1,    // B
        ulong offset1,                        // offset to B (0)
        global float * dst,                   // C
        ulong offsetd,                        // offset to C (0)
        uint K,                               // K
        int ne01,                             // M
        int ne02,                             // 1
        int ne10,                             // K
        int ne12,                             // 1
        int ne0,                              // M
        int ne1,                              // N
        int r2,                               // 1
        int r3)
{
    uint groupId = get_local_id(1);
    uint gid     = get_global_id(0);
    ushort slid  = get_sub_group_local_id();

    __private uint4  regA;
    __private half2  regS;
    __private float8 regB;

    __private float2 totalSum = (float2)(0.0f);

    // loop along K in block granularity, skip 4 blocks every iter
    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
        // first 4 fibers in each wave load 8 B values to its private scope
        if (slid < 4) {
            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
        }

        // load half weights for two blocks in consecutive rows
        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT

        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
    }

    // reduction in local memory, assumes #wave=4
    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];

    // 2 outputs per fiber in wave 0
    if (groupId == 0) {
        dst = (global float*)((global char*)dst + offsetd);
        vstore2(totalSum, 0, &(dst[gid * 2]));
    }

}
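The tail of the kernel above is a four-wave reduction: waves 1-3 park their float2 partial sums in local memory, and after the barrier wave 0 folds all three slots into its own accumulator before writing two outputs per fiber. A plain C++ model of that reduction layout (the 4-wave and lane-width assumptions mirror N_SIMDGROUP and SIMDGROUP_WIDTH, but this is only a sketch, not the kernel's code):

#include <vector>

// Simulates the end-of-kernel reduction: partial[w][lane] holds wave w's
// partial sum (flattened to float here for brevity); wave 0 accumulates
// the slots that waves 1..3 stored in local memory.
std::vector<float> reduce_waves(const std::vector<std::vector<float>> & partial) {
    const size_t n_waves = partial.size();     // N_SIMDGROUP == 4 in the kernel
    const size_t width   = partial[0].size();  // SIMDGROUP_WIDTH lanes per wave

    std::vector<float> result = partial[0];    // wave 0 keeps its own sum
    for (size_t w = 1; w < n_waves; ++w) {     // slots written by waves 1..3
        for (size_t lane = 0; lane < width; ++lane) {
            result[lane] += partial[w][lane];
        }
    }
    return result;
}
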
271
ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
Normal file
@@ -0,0 +1,271 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

// assume
#define QK4_0 32
#define N_SIMDGROUP 4

#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
    float shared_y; \
    shared_y = sub_group_broadcast(y.s0, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 0); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 0); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 0); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 0); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 0); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 0); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 0); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 1); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 1); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 1); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 1); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 1); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 1); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 1); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y.s0, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 2); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 2); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 2); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 2); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 2); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 2); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 2); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 3); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 3); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 3); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 3); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 3); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 3); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 3); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
    float8 shared_y; \
    shared_y = sub_group_broadcast(y, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


// NOTE: reuses the float8 shared_y declared by ..._8_hi, so the _hi variant
// must be expanded first in the same scope (as the kernel below does).
#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \

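/*
 * Rough execution model, derived from the code below: a work-group holds
 * N_SIMDGROUP waves; each wave strides through the K/QK4_0 blocks of a pair
 * of rows, accumulating two partial dot products per fiber in totalSum.
 * Waves 1..3 spill their partials to local memory, then wave 0 adds them in
 * and stores its two final outputs per fiber with vstore2.
 */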
__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
        __read_only image1d_buffer_t src0_q,  // quantized A
        global half2 * src0_d,                // A scales
        __read_only image1d_buffer_t src1,    // B
        ulong offset1,                        // offset to B (0)
        global float * dst,                   // C
        ulong offsetd,                        // offset to C (0)
        int ne00,                             // K
        int ne01,                             // M
        int ne02,                             // 1
        int ne10,                             // K
        int ne12,                             // 1
        int ne0,                              // M
        int ne1,                              // N
        int r2,                               // 1
        int r3)
{
    uint groupId = get_local_id(1);
    uint gid     = get_global_id(0);
    ushort slid  = get_sub_group_local_id();

    uint K = ne00;
    uint M = ne01;

    uint LINE_STRIDE_A  = M / 2;
    uint BLOCK_STRIDE_A = N_SIMDGROUP * M;

    __private uint4  regA;
    __private half2  regS;
    __private float8 regB;

    __private float2 totalSum = (float2)(0.0f);

    // loop along K in block granularity, skip 4 blocks every iter
    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
        // first 4 fibers in each wave load 8 B values to its private scope
        if (slid < 4) {
            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
        }

        // load half weights for two blocks in consecutive rows
        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT // spelling kept as-is; must match the host-side define
        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT

        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
    }

    // reduction in local memory, assumes #wave=4
    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];

    // 2 outputs per fiber in wave 0
    if (groupId == 0) {
        dst = (global float*)((global char*)dst + offsetd);
        vstore2(totalSum, 0, &(dst[gid * 2]));
    }
}
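For context, a minimal host-side launch sketch in C. The argument order follows the kernel signature above; everything else — the handle names, the sub-group width of 64, and the assumption that M/2 is a multiple of it — is illustrative rather than taken from this diff. The 4 waves per work-group match the reduction's "#wave=4" assumption.

#include <CL/cl.h>

/* Hypothetical launcher; queue/kernel/buffers are created elsewhere. */
static void launch_gemv_noshuffle(cl_command_queue queue, cl_kernel kernel,
                                  cl_mem src0_q, cl_mem src0_d, cl_mem src1,
                                  cl_mem dst, cl_int M, cl_int K) {
    cl_ulong offset1 = 0, offsetd = 0;
    cl_int ne02 = 1, ne12 = 1, ne1 = 1, r2 = 1, r3 = 1;

    clSetKernelArg(kernel,  0, sizeof(cl_mem),   &src0_q);
    clSetKernelArg(kernel,  1, sizeof(cl_mem),   &src0_d);
    clSetKernelArg(kernel,  2, sizeof(cl_mem),   &src1);
    clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1);
    clSetKernelArg(kernel,  4, sizeof(cl_mem),   &dst);
    clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd);
    clSetKernelArg(kernel,  6, sizeof(cl_int),   &K);    // ne00
    clSetKernelArg(kernel,  7, sizeof(cl_int),   &M);    // ne01
    clSetKernelArg(kernel,  8, sizeof(cl_int),   &ne02);
    clSetKernelArg(kernel,  9, sizeof(cl_int),   &K);    // ne10
    clSetKernelArg(kernel, 10, sizeof(cl_int),   &ne12);
    clSetKernelArg(kernel, 11, sizeof(cl_int),   &M);    // ne0
    clSetKernelArg(kernel, 12, sizeof(cl_int),   &ne1);
    clSetKernelArg(kernel, 13, sizeof(cl_int),   &r2);
    clSetKernelArg(kernel, 14, sizeof(cl_int),   &r3);

    /* Each fiber stores 2 rows of C (vstore2), so M/2 fibers along dim 0;
       4 waves of an assumed 64-wide sub-group along dim 1. */
    const size_t simdgroup_width = 64; /* assumption, device-dependent */
    size_t local[2]  = { simdgroup_width, 4 };
    size_t global[2] = { (size_t)(M / 2), 4 };
    clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
}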