mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-05 13:53:23 +02:00)

Compare commits (60 commits):

334f76fa38, efd56b1c21, 201294ae17, 5a9e2f60ba, 373ee3fbba, 4cb4d8b22d, 3a03541ced, 56d03d92be,
a46f50747b, c5688c6250, 4ef245a92a, 973053d8b0, 7c8bcc11dc, 7fe4678b02, ba2135ccae, 89febfed93,
5022cf242d, 1ecea255eb, a00a35cef9, eccd7a26dd, c14f72db9c, cc6cac08e3, 580111d42b, 88c46cbdac,
a14679cc30, 6560bed3f0, 06bf2cf8c4, 4ed8e4fbef, 9c405c9f9a, 5207b3fbc5, 8dbbd75754, c0a8c6db37,
b9111bd209, 633782b8d9, 22f83f0c38, bb9dcd560a, f50db6ae0b, d8c054517d, 42f664a382, 5dde540897,
40c3a6c1e1, f24ed14ee0, 9d679f0fcc, 1387cf60f7, 6fd413791a, 337c9cbd52, a3145bdc30, 890559ab28,
d0e3ce51f4, 68a6b98b3c, 70d45af0ef, 13e2c771aa, f53119cec4, 7084755396, 4480542b22, 11b12de39b,
3a9cb4ca64, 769a716e30, f0d1fafc02, a0c2dad9d4
.devops/nix/docker.nix (new file, 37 lines)
@@ -0,0 +1,37 @@
{
  lib,
  dockerTools,
  buildEnv,
  llama-cpp,
  interactive ? true,
  coreutils,
}:

# A tar that can be fed into `docker load`:
#
#   $ nix build .#llamaPackages.docker
#   $ docker load < result

# For details and variations cf.
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
# - https://nixery.dev/

# Approximate (compressed) sizes, at the time of writing, are:
#
# .#llamaPackages.docker: 125M;
# .#llamaPackagesCuda.docker: 537M;
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.

dockerTools.buildLayeredImage {
  name = llama-cpp.pname;
  tag = "latest";

  contents =
    [ llama-cpp ]
    ++ lib.optionals interactive [
      coreutils
      dockerTools.binSh
      dockerTools.caCertificates
    ];
}
@@ -255,11 +255,11 @@ effectiveStdenv.mkDerivation (
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
      badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
      broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
      broken = (useMetalKit && !effectiveStdenv.isDarwin);

      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
      homepage = "https://github.com/ggerganov/llama.cpp/";
@@ -12,5 +12,8 @@ lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    llama-cpp = self.callPackage ./package.nix { };
    docker = self.callPackage ./docker.nix { };
    docker-min = self.callPackage ./docker.nix { interactive = false; };
    sif = self.callPackage ./sif.nix { };
  }
)
.devops/nix/sif.nix (new file, 27 lines)
@@ -0,0 +1,27 @@
{
  lib,
  singularity-tools,
  llama-cpp,
  bashInteractive,
  interactive ? false,
}:

let
  optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
  inherit (llama-cpp) name;
  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];

  # These are excessive (but safe) for most variants. Building singularity
  # images requires superuser privileges, so we build them inside a VM in a
  # writable image of pre-determined size.
  #
  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
  #
  # Expected image sizes:
  # - cpu/blas: 150M,
  # - cuda, all gencodes: 560M,
  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
  memSize = diskSize;
}
.github/workflows/nix-ci-aarch64.yml (vendored, 7 changed lines)
@@ -19,7 +19,6 @@ on:

jobs:
  nix-build-aarch64:
    if: ${{ vars.CACHIX_NAME != '' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
@@ -37,8 +36,8 @@ jobs:
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -46,7 +45,7 @@ jobs:
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: ${{ vars.CACHIX_NAME }}
          name: llama-cpp
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
.github/workflows/nix-ci.yml (vendored, 11 changed lines)
@@ -23,8 +23,8 @@ jobs:
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,7 +37,6 @@ jobs:
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    if: ${{ vars.CACHIX_NAME != '' }}
    strategy:
      fail-fast: false
      matrix:
@@ -51,8 +50,8 @@ jobs:
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
            extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -60,7 +59,7 @@ jobs:
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: ${{ vars.CACHIX_NAME }}
          name: llama-cpp
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
@@ -110,6 +110,7 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests"
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
@@ -145,14 +146,6 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag)

if (LLAMA_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        add_compile_options(-Werror)
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
        add_compile_options(/WX)
    endif()
endif()

# enable libstdc++ assertions for debug builds
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
@@ -209,6 +202,29 @@ if (LLAMA_METAL)
    # copy ggml-metal.metal to bin directory
    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

    if (LLAMA_METAL_EMBED_LIBRARY)
        enable_language(ASM)
        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")

        add_custom_command(
            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
            DEPENDS ${METALLIB_SOURCE}
            COMMENT "Generate assembly for embedded Metal library"
        )

        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
    endif()

    if (LLAMA_METAL_SHADER_DEBUG)
        # custom command to do the following:
        #   xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
@@ -741,28 +757,30 @@ function(get_flags CCID CCVER)
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
        endif()
    elseif (CCID MATCHES "Intel")
        if (NOT LLAMA_SYCL)
            # enable max optimization level when using Intel compiler
            set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
            add_link_options(-fuse-ld=lld -static-intel)
        endif()
    endif()

    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
endfunction()

if (LLAMA_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        list(APPEND C_FLAGS -Werror)
        list(APPEND CXX_FLAGS -Werror)
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
        add_compile_options(/WX)
    endif()
endif()

if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
        set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
            -Werror=implicit-int -Werror=implicit-function-declaration)
        set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)
        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
        list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
            -Werror=implicit-int -Werror=implicit-function-declaration)
        list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

        set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS})
        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
        list(APPEND C_FLAGS ${WARNING_FLAGS})
        list(APPEND CXX_FLAGS ${WARNING_FLAGS})

        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

@@ -778,9 +796,10 @@ endif()
set(CUDA_CXX_FLAGS "")

if (LLAMA_CUBLAS)
    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
    if (NOT MSVC)
        list(APPEND CUDA_FLAGS -Wno-pedantic)
    set(CUDA_FLAGS -use_fast_math)

    if (LLAMA_FATAL_WARNINGS)
        list(APPEND CUDA_FLAGS -Werror all-warnings)
    endif()

    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
@@ -814,7 +833,11 @@ if (LLAMA_CUBLAS)
        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")

        get_flags(${CUDA_CCID} ${CUDA_CCVER})
        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS})              # This is passed to -Xcompiler later
        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
    endif()

    if (NOT MSVC)
        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
    endif()
endif()
Makefile (75 changed lines)
@@ -97,9 +97,10 @@ endif
#

# keep standard at C11 and C++11
MK_CPPFLAGS = -I. -Icommon
MK_CFLAGS   = -std=c11  -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_CPPFLAGS  = -I. -Icommon
MK_CFLAGS    = -std=c11  -fPIC
MK_CXXFLAGS  = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11

# -Ofast tends to produce faster code, but may not be available for some compilers.
ifdef LLAMA_FAST
@@ -172,7 +173,7 @@ ifdef LLAMA_DEBUG
	MK_LDFLAGS += -g

	ifeq ($(UNAME_S),Linux)
		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
	endif
else
	MK_CPPFLAGS += -DNDEBUG
@@ -216,34 +217,10 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

ifeq ($(LLAMA_FATAL_WARNINGS),1)
	MK_CFLAGS += -Werror
	MK_CFLAGS += -Werror
	MK_CXXFLAGS += -Werror
endif

ifeq ($(CC_IS_CLANG), 1)
	# clang options
	MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi

	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
		MK_CFLAGS += -Wdouble-promotion
	endif
	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
		MK_CFLAGS += -Wdouble-promotion
	endif
else
	# gcc options
	MK_CFLAGS += -Wdouble-promotion
	MK_HOST_CXXFLAGS += -Wno-array-bounds

	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
		MK_HOST_CXXFLAGS += -Wno-format-truncation
	endif
	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
		MK_HOST_CXXFLAGS += -Wextra-semi
	endif
endif

# this version of Apple ld64 is buggy
ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -408,6 +385,9 @@ ifdef LLAMA_CUBLAS
	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
	OBJS += ggml-cuda.o
	MK_NVCCFLAGS += -use_fast_math
ifdef LLAMA_FATAL_WARNINGS
	MK_NVCCFLAGS += -Werror all-warnings
endif # LLAMA_FATAL_WARNINGS
ifndef JETSON_EOL_MODULE_DETECT
	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT
@@ -466,9 +446,9 @@ ifdef LLAMA_CUDA_CCBIN
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
ifdef JETSON_EOL_MODULE_DETECT
	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
else
	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # JETSON_EOL_MODULE_DETECT
endif # LLAMA_CUBLAS
@@ -553,11 +533,29 @@ ifdef LLAMA_METAL
ifdef LLAMA_METAL_NDEBUG
	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
endif
ifdef LLAMA_METAL_EMBED_LIBRARY
	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
	OBJS += ggml-metal-embed.o
endif
endif # LLAMA_METAL

ifdef LLAMA_METAL
ggml-metal.o: ggml-metal.m ggml-metal.h
	$(CC) $(CFLAGS) -c $< -o $@

ifdef LLAMA_METAL_EMBED_LIBRARY
ggml-metal-embed.o: ggml-metal.metal
	@echo "Embedding Metal library"
	$(eval TEMP_ASSEMBLY=$(shell mktemp))
	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
	@$(AS) $(TEMP_ASSEMBLY) -o $@
	@rm -f ${TEMP_ASSEMBLY}
endif
endif # LLAMA_METAL

ifdef LLAMA_MPI
@@ -569,9 +567,10 @@ GF_CC := $(CC)
include scripts/get-flags.mk

# combine build flags with cmdline overrides
override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
@@ -579,7 +578,7 @@ override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
ifdef LLAMA_CUBLAS
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(GF_CXXFLAGS)
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif

#
@@ -720,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -891,3 +890,7 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact

a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

Recommend to install to default folder: **/opt/intel/oneapi**.
Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.

Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
README.md (15 changed lines)
@@ -10,13 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Hot topics

- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
- [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats:
  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

----
@@ -61,7 +57,7 @@ variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan, SYCL, and (partial) OpenCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -107,6 +103,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)

**Multimodal models:**
@@ -145,6 +142,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
@@ -156,6 +154,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)

---
@@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });

    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
@@ -655,6 +655,8 @@ class OrionModel(Model):
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        # note: config provides rms norm but it is actually layer norm
        # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

    def write_tensors(self):
@@ -1031,7 +1033,6 @@ class PersimmonModel(Model):
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
@@ -1533,16 +1533,17 @@ int main(int argc, char ** argv) {

        int n_past = 0;

        ggml_cgraph gf = {};
        struct ggml_cgraph * gf = NULL;
        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);

        get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
        ggml_build_forward_expand(gf, e);
        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

        float error_before_opt = ggml_get_f32_1d(e, 0);

@@ -1552,8 +1553,8 @@ int main(int argc, char ** argv) {
        opt_params_lbfgs.lbfgs.n_iter = 16;
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
        ggml_build_forward_expand(&gf, e);
        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
        ggml_build_forward_expand(gf, e);
        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

        float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) {
        };
        struct ggml_context * ctx0 = ggml_init(params);

        ggml_cgraph gf = {};
        struct ggml_cgraph * gf = NULL;
        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);

        int n_past = 0;
        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
        struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);

        ggml_build_forward_expand(&gf, logits);
        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
        ggml_build_forward_expand(gf, logits);
        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);

        struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
        struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -87,7 +87,21 @@ class SchemaConverter:
        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            list_item_operator = f'("," space {item_rule_name})'
            successive_items = ""
            min_items = schema.get("minItems", 0)
            if min_items > 0:
                first_item = f"({item_rule_name})"
                successive_items = list_item_operator * (min_items - 1)
                min_items -= 1
            else:
                first_item = f"({item_rule_name})?"
            max_items = schema.get("maxItems")
            if max_items is not None and max_items > min_items:
                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
            else:
                successive_items += list_item_operator + "*"
            rule = f'"[" space {first_item} {successive_items} "]" space'
            return self._add_rule(rule_name, rule)

        else:
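For reference, here is a minimal standalone sketch of the array-rule expansion added in the hunk above, re-implemented outside the `SchemaConverter` class; the rule name `item` and the helper function are illustrative, not part of the script's API:

```python
# Standalone sketch of the minItems/maxItems expansion shown above.
# `item_rule_name` stands in for the rule produced by SchemaConverter.visit().
def array_rule(item_rule_name: str, min_items: int = 0, max_items=None) -> str:
    list_item_operator = f'("," space {item_rule_name})'
    successive_items = ""
    if min_items > 0:
        first_item = f"({item_rule_name})"
        successive_items = list_item_operator * (min_items - 1)
        min_items -= 1
    else:
        first_item = f"({item_rule_name})?"
    if max_items is not None and max_items > min_items:
        # every allowed slot beyond the required ones becomes an optional repetition
        successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
    else:
        successive_items += list_item_operator + "*"
    return f'"[" space {first_item} {successive_items} "]" space'

# A schema with minItems=2 and maxItems=4 yields a rule that requires two
# items and allows up to two more:
print(array_rule("item", min_items=2, max_items=4))
# "[" space (item) ("," space item)("," space item)?("," space item)? "]" space
```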
@@ -59,14 +59,39 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.

## LLaVA 1.6 gguf conversion

1) Backup your pth/safetensor model files as llava-surgery modifies them
2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models:
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json.
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision`
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```

4) Create the visual gguf model:
```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
5) Everything else as usual: convert.py the hf model, quantize as needed

5) Then convert the model to gguf format:
```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```

6) And finally we can run the llava-cli using the 1.6 model version:
```console
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```

**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
@@ -616,9 +616,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        KQ = ggml_soft_max_inplace(ctx0, KQ);
        struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
        KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
        KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));
        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

        cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size));
        cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
    }

    // attention output
@@ -65,9 +65,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
        for name in clip_tensors:
            del checkpoint[name]

        # Save the updated checkpoint
        checkpoint_path = checkpoint_path
        save_model(checkpoint, checkpoint_path, file_type)
        return True
    return False

@@ -152,16 +150,6 @@ for name in first_mm_tensors:
if len(projector) > 0:
    save_model(projector, f"{args.model}/llava.projector", 'pytorch')

for name in mm_tensors:
    del last_checkpoint[name]
for name in first_mm_tensors:
    del first_checkpoint[name]

if len(mm_tensors) > 0:
    save_model(last_checkpoint, projector_checkpoint_path, file_type)
if len(first_mm_tensors) > 0:
    save_model(first_checkpoint, newline_checkpoint_path, file_type)

print("Done!")
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
@@ -25,9 +25,6 @@ if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")

    # remove these tensors
    for name in clip_tensors:
        del checkpoint[name]

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
@@ -35,7 +32,6 @@ if len(clip_tensors) > 0:
        f.write("{}\n")


torch.save(checkpoint, path)

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
@@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    return true;
}

static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
@@ -31,6 +31,8 @@ struct llava_image_embed {
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);

LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
@@ -334,6 +334,8 @@ int main(int argc, char ** argv) {
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
        params.n_keep = (int)embd_inp.size();
    } else {
        params.n_keep += add_bos; // always keep the BOS token
    }

    // prefix & suffix for instruct mode
@@ -383,8 +385,8 @@ int main(int argc, char ** argv) {
        }
    }

    if (params.n_keep > 0) {
        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
    if (params.n_keep > add_bos) {
        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
        for (int i = 0; i < params.n_keep; i++) {
            LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
@@ -540,14 +542,14 @@ int main(int argc, char ** argv) {
                break;
            }

            const int n_left    = n_past - params.n_keep - 1;
            const int n_left    = n_past - params.n_keep;
            const int n_discard = n_left/2;

            LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                n_past, n_left, n_ctx, params.n_keep, n_discard);

            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
            llama_kv_cache_seq_rm   (ctx, 0, params.n_keep            , params.n_keep + n_discard);
            llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

            n_past -= n_discard;
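The arithmetic of the updated context shift can be illustrated with a small standalone simulation; this is a sketch only, using a plain list in place of the KV cache and made-up numbers, not the real llama.cpp API:

```python
# Sketch of the context-shift arithmetic from the hunk above: keep the first
# n_keep tokens, discard half of the remainder, shift the rest down.
def shift_context(tokens, n_keep):
    n_past = len(tokens)
    n_left = n_past - n_keep           # tokens eligible for discarding
    n_discard = n_left // 2            # drop half of them
    # keep [0, n_keep), drop [n_keep, n_keep + n_discard), keep the tail
    return tokens[:n_keep] + tokens[n_keep + n_discard:]

ctx = list(range(16))                  # pretend these are 16 cached token positions
print(shift_context(ctx, n_keep=4))
# [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]
```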
@@ -32,6 +32,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
    { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.25 bpw non-linear quantization", },
    { "Q4_K",    LLAMA_FTYPE_MOSTLY_Q4_K_M,  "alias for Q4_K_M", },
    { "Q4_K_S",  LLAMA_FTYPE_MOSTLY_Q4_K_S,  " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
    { "Q4_K_M",  LLAMA_FTYPE_MOSTLY_Q4_K_M,  " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
@@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

## Build
@@ -134,10 +135,13 @@ node index.js
## API Endpoints

- **GET** `/health`: Returns the current state of the server:
  - `{"status": "loading model"}` if the model is still being loaded.
  - `{"status": "error"}` if the model failed to load.
  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
  - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
  - 500 -> `{"status": "error"}` if the model failed to load.
  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.

  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
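As a rough illustration, the new status codes described above can be consumed from a client like this; it is a sketch assuming the default `127.0.0.1:8080` listen address, with `fail_on_no_slot` being the query parameter documented in the list above:

```python
# Sketch: poll /health and react to the documented status codes.
import json
import urllib.error
import urllib.request

def check_health(base_url="http://127.0.0.1:8080"):
    url = base_url + "/health?fail_on_no_slot=1"
    try:
        with urllib.request.urlopen(url) as resp:
            # 200: "ok" or "no slot available" (without fail_on_no_slot)
            return resp.status, json.load(resp)
    except urllib.error.HTTPError as e:
        # 503: loading model or no slot available; 500: model failed to load
        return e.code, json.loads(e.read() or b"{}")

status, body = check_health()
print(status, body.get("status"), body.get("slots_idle"), body.get("slots_processing"))
```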
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@@ -147,7 +151,7 @@ node index.js

    `temperature`: Adjust the randomness of the generated text (default: 0.8).

    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).
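For example, the band described above works out as follows (plain arithmetic with arbitrarily chosen values):

```python
# dynatemp_range defines a symmetric band around the base temperature
temperature, dynatemp_range = 0.8, 0.3
low, high = temperature - dynatemp_range, temperature + dynatemp_range
print(f"final temperature is sampled from [{low:.1f}, {high:.1f}]")  # [0.5, 1.1]
```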
@@ -205,7 +209,7 @@ node index.js

    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

@@ -238,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`.

- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
@@ -15,13 +15,11 @@
using json = nlohmann::json;

inline static json oaicompat_completion_params_parse(
    const struct llama_model * model,
    const json &body, /* openai api json semantics */
    const std::string &chat_template)
{
    json llama_params;
    std::string formatted_prompt = chat_template == "chatml"
        ? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...)
        : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)

    llama_params["__oaicompat"] = true;

@@ -34,7 +32,7 @@ inline static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]        = json_value(body, "model", std::string("unknown"));
    llama_params["prompt"]       = formatted_prompt;
    llama_params["prompt"]       = format_chat(model, chat_template, body["messages"]);
    llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"]  = json_value(body, "temperature", 0.0);
    llama_params["top_k"]        = json_value(body, "top_k", default_sparams.top_k);
@@ -5,6 +5,7 @@
#include "oai.hpp"

#include "../llava/clip.h"
#include "../llava/llava.h"

#include "stb_image.h"

@@ -37,7 +38,7 @@ struct server_params
    std::string hostname = "127.0.0.1";
    std::vector<std::string> api_keys;
    std::string public_path = "examples/server/public";
    std::string chat_template = "chatml";
    std::string chat_template = "";
    int32_t port = 8080;
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;
@@ -399,6 +400,16 @@ struct llama_server_context
        return true;
    }

    void validate_model_chat_template(server_params & sparams) {
        llama_chat_message chat[] = {{"user", "test"}};
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }

    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -997,43 +1008,12 @@ struct llama_server_context
                {
                    continue;
                }
                clip_image_f32_batch img_res_v;
                img_res_v.size = 0;
                img_res_v.data = nullptr;
                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
                {
                    LOG_TEE("Error processing the given image");
                    clip_free(clp_ctx);
                    clip_image_f32_batch_free(img_res_v);
                    return false;
                }
                if (img_res_v.size == 0)
                {

                if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                    LOG_TEE("Error processing the given image");
                    return false;
                }

                // note: assumes only one image was returned by clip_image_preprocess
                clip_image_f32 * img_res = img_res_v.data;

                img.image_tokens = clip_n_patches(clp_ctx);
                img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
                if (!img.image_embedding)
                {
                    LOG_TEE("Unable to allocate memory for image embeddings\n");
                    clip_image_f32_batch_free(img_res_v);
                    clip_free(clp_ctx);
                    return false;
                }
                LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
                if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
                {
                    LOG_TEE("Unable to encode image\n");
                    clip_image_f32_batch_free(img_res_v);
                    return false;
                }

                clip_image_f32_batch_free(img_res_v);

                img.request_encode_image = false;
            }
@@ -1424,6 +1404,46 @@ struct llama_server_context
            case TASK_TYPE_NEXT_RESPONSE: {
                // do nothing
            } break;
            case TASK_TYPE_SLOTS_DATA: {
                json slots_data = json::array();
                int n_idle_slots = 0;
                int n_processing_slots = 0;

                for (llama_client_slot &slot: slots) {
                    if (slot.available()) {
                        n_idle_slots++;
                    } else {
                        n_processing_slots++;
                    }
                    json slot_data = get_formated_generation(slot);
                    slot_data["id"] = slot.id;
                    slot_data["task_id"] = slot.task_id;
                    slot_data["state"] = slot.state;
                    slot_data["prompt"] = slot.prompt;
                    slot_data["next_token"] = {
                        {"has_next_token", slot.has_next_token},
                        {"n_remain", slot.n_remaining},
                        {"num_tokens_predicted", slot.n_decoded},
                        {"stopped_eos", slot.stopped_eos},
                        {"stopped_word", slot.stopped_word},
                        {"stopped_limit", slot.stopped_limit},
                        {"stopping_word", slot.stopping_word},
                    };
                    slots_data.push_back(slot_data);
                }
                LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
                task_result res;
                res.id = task.id;
                res.multitask_id = task.multitask_id;
                res.stop = true;
                res.error = false;
                res.result_json = {
                    { "idle", n_idle_slots },
                    { "processing", n_processing_slots },
                    { "slots", slots_data }
                };
                queue_results.send(res);
            } break;
        }
    }
@@ -1477,14 +1497,15 @@ struct llama_server_context
            if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
            {
                // Shift context
                const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
                const int n_keep    = slot.params.n_keep + add_bos_token;
                const int n_left    = system_tokens.size() + slot.n_past - n_keep;
                const int n_discard = n_left / 2;

                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
                llama_kv_cache_seq_rm   (ctx, slot.id, n_keep            , n_keep + n_discard);
                llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);

                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
                for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
                {
                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                }
@@ -1497,7 +1518,7 @@ struct llama_server_context

                LOG_VERBOSE("context shift", {
                    { "n_ctx",  n_ctx },
                    { "n_keep", params.n_keep },
                    { "n_keep", n_keep },
                    { "n_left", n_left },
                });
            }
@@ -1937,8 +1958,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
    printf("  -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
    printf("  --chat-template FORMAT_NAME");
    printf("                        set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
    printf("  --chat-template JINJA_TEMPLATE\n");
    printf("                        set custom jinja chat template (default: template taken from model's metadata)\n");
    printf("                        Note: only commonly used templates are accepted, since we don't have jinja parser\n");
    printf("\n");
}
@@ -2389,13 +2411,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
            std::string value(argv[i]);
            if (value != "chatml" && value != "llama2") {
                fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str());
            if (!verify_custom_template(argv[i])) {
                fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
                fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
                invalid_param = true;
                break;
            }
            sparams.chat_template = value;
            sparams.chat_template = argv[i];
        }
        else if (arg == "--override-kv")
        {
@@ -2582,40 +2604,44 @@ int main(int argc, char **argv)
res.set_header("Access-Control-Allow-Headers", "*");
});

svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
server_state current_state = state.load();
switch(current_state) {
case SERVER_STATE_READY:
if (llama.all_slots_are_idle) {
res.set_content(R"({"status": "ok"})", "application/json");
res.status = 200; // HTTP OK
} else {
int available_slots = 0;
int processing_slots = 0;
for (llama_client_slot & slot : llama.slots) {
if (slot.available()) {
available_slots++;
} else {
processing_slots++;
}
}
if (available_slots > 0) {
json health = {
{"status", "ok"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
res.status = 200; // HTTP OK
} else {
json health = {
{"status", "no slot available"},
{"slots_idle", available_slots},
{"slots_processing", processing_slots}};
res.set_content(health.dump(), "application/json");
case SERVER_STATE_READY: {
// request slots data using task queue
task_server task;
task.id = llama.queue_tasks.get_new_id();
task.type = TASK_TYPE_SLOTS_DATA;
task.target_id = -1;

llama.queue_results.add_waiting_task_id(task.id);
llama.queue_tasks.post(task);

// get the result
task_result result = llama.queue_results.recv(task.id);
llama.queue_results.remove_waiting_task_id(task.id);

int n_idle_slots = result.result_json["idle"];
int n_processing_slots = result.result_json["processing"];

json health = {
{"status", "ok"},
{"slots_idle", n_idle_slots},
{"slots_processing", n_processing_slots}};
res.status = 200; // HTTP OK
if (sparams.slots_endpoint && req.has_param("include_slots")) {
health["slots"] = result.result_json["slots"];
}

if (n_idle_slots == 0) {
health["status"] = "no slot available";
if (req.has_param("fail_on_no_slot")) {
res.status = 503; // HTTP Service Unavailable
}
}
res.set_content(health.dump(), "application/json");
break;
}
case SERVER_STATE_LOADING_MODEL:
res.set_content(R"({"status": "loading model"})", "application/json");
res.status = 503; // HTTP Service Unavailable
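The reworked /health handler above now reports slot usage through the task queue. As a rough illustration only (not part of the change itself), a client could poll it as sketched below, reusing the cpp-httplib and nlohmann::json headers the server already vendors; the host, port and query string are placeholder assumptions.

    #include "httplib.h"
    #include "json.hpp"
    #include <cstdio>

    // Minimal sketch: check whether a slot is free before submitting work.
    // Sending "fail_on_no_slot" makes the server answer 503 when every slot is busy.
    int main() {
        httplib::Client cli("localhost", 8080);          // placeholder host/port
        auto res = cli.Get("/health?fail_on_no_slot=1");
        if (!res) {
            return 1;                                    // connection failed
        }
        auto health = nlohmann::json::parse(res->body);
        std::printf("status=%s idle=%d processing=%d (HTTP %d)\n",
                    health.value("status", "unknown").c_str(),
                    health.value("slots_idle", 0),
                    health.value("slots_processing", 0),
                    res->status);
        return res->status == 200 ? 0 : 1;
    }
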
@@ -2629,26 +2655,20 @@ int main(int argc, char **argv)

if (sparams.slots_endpoint) {
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
json slots;
for (llama_client_slot & slot : llama.slots) {
json slot_data = llama.get_formated_generation(slot);
slot_data["id"] = slot.id;
slot_data["task_id"] = slot.task_id;
slot_data["state"] = slot.state;
slot_data["prompt"] = slot.prompt;
slot_data["next_token"] = {
{"has_next_token", slot.has_next_token},
{"n_remain", slot.n_remaining},
{"num_tokens_predicted", slot.n_decoded},
{"stopped_eos", slot.stopped_eos},
{"stopped_word", slot.stopped_word},
{"stopped_limit", slot.stopped_limit},
{"stopping_word", slot.stopping_word},
};
// request slots data using task queue
task_server task;
task.id = llama.queue_tasks.get_new_id();
task.type = TASK_TYPE_SLOTS_DATA;
task.target_id = -1;

slots.push_back(slot_data);
}
res.set_content(slots.dump(), "application/json");
llama.queue_results.add_waiting_task_id(task.id);
llama.queue_tasks.post(task);

// get the result
task_result result = llama.queue_results.recv(task.id);
llama.queue_results.remove_waiting_task_id(task.id);

res.set_content(result.result_json["slots"].dump(), "application/json");
res.status = 200; // HTTP OK
});
}
@@ -2742,6 +2762,11 @@ int main(int argc, char **argv)
LOG_INFO("model loaded", {});
}

if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us
llama.validate_model_chat_template(sparams);
}

// Middleware for API key validation
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
// If API key is not set, skip validation
@@ -2913,7 +2938,7 @@ int main(int argc, char **argv)
if (!validate_api_key(req, res)) {
return;
}
json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template);
json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);

const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);

@@ -49,7 +49,8 @@ enum server_state {
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
TASK_TYPE_NEXT_RESPONSE,
TASK_TYPE_SLOTS_DATA
};

struct task_server {
@@ -167,50 +168,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v
: default_value;
}

inline std::string format_llama2(std::vector<json> messages)
{
std::ostringstream output;
bool is_inside_turn = false;

for (auto it = messages.begin(); it != messages.end(); ++it) {
if (!is_inside_turn) {
output << "[INST] ";
}
std::string role    = json_value(*it, "role", std::string("user"));
std::string content = json_value(*it, "content", std::string(""));
if (role == "system") {
output << "<<SYS>>\n" << content << "\n<<SYS>>\n\n";
is_inside_turn = true;
} else if (role == "user") {
output << content << " [/INST]";
is_inside_turn = true;
} else {
output << " " << content << " </s>";
is_inside_turn = false;
}
}

LOG_VERBOSE("format_llama2", {{"text", output.str()}});

return output.str();
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
std::vector<char> buf(1);
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
return res >= 0;
}

inline std::string format_chatml(std::vector<json> messages)
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
{
std::ostringstream chatml_msgs;
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());

for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
for (size_t i = 0; i < messages.size(); ++i) {
auto &curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
chat[i].role = str[i*2 + 0].c_str();
chat[i].content = str[i*2 + 1].c_str();
}

chatml_msgs << "<|im_start|>assistant" << '\n';
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size * 2);

LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());

return chatml_msgs.str();
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}

std::string formatted_chat(buf.data(), res);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});

return formatted_chat;
}
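The helper above can be exercised directly with the same message layout the OpenAI-compatible route receives. A minimal, illustrative call is sketched below; model stands for a llama_model pointer loaded elsewhere, and the messages are made up for the example.

    // Sketch: apply the template embedded in the model (tmpl == "") to two messages.
    std::vector<json> messages = {
        {{"role", "system"}, {"content", "You are a helpful assistant."}},
        {{"role", "user"},   {"content", "Hello!"}},
    };
    std::string prompt = format_chat(model, /*tmpl=*/"", messages);
    // A custom template string (for example the one passed via --chat-template)
    // can be supplied as the second argument instead of the empty string.
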

//

@@ -150,6 +150,7 @@
packages =
{
default = config.legacyPackages.llamaPackages.llama-cpp;
vulkan = config.packages.default.override { useVulkan = true; };
}
// lib.optionalAttrs pkgs.stdenv.isLinux {
opencl = config.packages.default.override { useOpenCL = true; };
@@ -157,7 +158,6 @@

mpi-cpu = config.packages.default.override { useMpi = true; };
mpi-cuda = config.packages.default.override { useMpi = true; };
vulkan = config.packages.default.override { useVulkan = true; };
}
// lib.optionalAttrs (system == "x86_64-linux") {
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;

116
ggml-alloc.c
@@ -377,6 +377,9 @@ struct ggml_gallocr {
|
||||
|
||||
struct node_alloc * node_allocs; // [n_nodes]
|
||||
int n_nodes;
|
||||
|
||||
struct tensor_alloc * leaf_allocs; // [n_leafs]
|
||||
int n_leafs;
|
||||
};
|
||||
|
||||
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
||||
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
||||
free(galloc->buffers);
|
||||
free(galloc->buf_tallocs);
|
||||
free(galloc->node_allocs);
|
||||
free(galloc->leaf_allocs);
|
||||
free(galloc);
|
||||
}
|
||||
|
||||
@@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
struct ggml_tensor * parent = node->src[i];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
// if the node's data is external, then we cannot re-use it
|
||||
@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
||||
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
||||
|
||||
// allocate all graph inputs first to avoid overwriting them
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
||||
}
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (graph->nodes[i]->src[j] == NULL) {
|
||||
continue;
|
||||
}
|
||||
if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// count number of children and views
|
||||
// allocate all graph inputs and leafs first to avoid overwriting them
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
}
|
||||
ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
|
||||
if (node->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
|
||||
|
||||
// allocate explicit inputs and leafs
|
||||
if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
|
||||
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// allocate the remaining leafs that are unused on the graph
|
||||
// these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
||||
|
||||
if (hn->n_children == 0) {
|
||||
assert(!hn->allocated);
|
||||
// since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
|
||||
ggml_gallocr_allocate_node(galloc, leaf, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// allocate tensors
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
@@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
ggml_gallocr_allocate_node(galloc, parent, buffer_id);
|
||||
}
|
||||
@@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
AT_PRINTF("%s", parent->name);
|
||||
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||
@@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * parent = node->src[j];
|
||||
if (parent == NULL) {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
|
||||
p_hn->n_children -= 1;
|
||||
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||
}
|
||||
}
|
||||
}
|
||||
if (galloc->n_leafs < graph->n_leafs) {
|
||||
free(galloc->leaf_allocs);
|
||||
galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
|
||||
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
||||
}
|
||||
galloc->n_leafs = graph->n_leafs;
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
||||
galloc->leaf_allocs[i].offset = hn->offset;
|
||||
galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
||||
}
|
||||
|
||||
// reallocate buffers if needed
|
||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
||||
return ggml_gallocr_reserve_n(galloc, graph, NULL);
|
||||
}
|
||||
|
||||
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
|
||||
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
|
||||
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
|
||||
assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
|
||||
|
||||
if (node->view_src != NULL) {
|
||||
if (node->buffer == NULL) {
|
||||
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||
// this tensor was allocated without ggml-backend
|
||||
return;
|
||||
}
|
||||
ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
|
||||
ggml_backend_view_init(galloc->buffers[buffer_id], node);
|
||||
}
|
||||
} else {
|
||||
if (node->data == NULL) {
|
||||
assert(tensor_alloc->offset != SIZE_MAX);
|
||||
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
|
||||
void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
|
||||
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
|
||||
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
|
||||
void * addr = (char *)base + tensor_alloc->offset;
|
||||
ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
|
||||
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
|
||||
} else {
|
||||
if (node->buffer == NULL) {
|
||||
// this tensor was allocated without ggml-backend
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
size_t offset =
|
||||
(char *)node->data -
|
||||
(char *)ggml_backend_buffer_get_base(node->buffer);
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
|
||||
assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
|
||||
assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
||||
return true;
|
||||
}
|
||||
|
||||
if (galloc->n_leafs != graph->n_leafs) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
||||
@@ -827,6 +850,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
||||
}
|
||||
|
||||
// allocate the graph tensors from the previous assignments
|
||||
// nodes
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
||||
@@ -835,9 +859,15 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
||||
if (src == NULL) {
|
||||
continue;
|
||||
}
|
||||
ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
|
||||
ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
|
||||
}
|
||||
ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
|
||||
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
|
||||
}
|
||||
// leafs
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
struct ggml_tensor * leaf = graph->leafs[i];
|
||||
struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
||||
ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
|
||||
}
|
||||
|
||||
return true;
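The hunks above extend ggml_gallocr with per-leaf allocation records. For orientation only, the usual allocator flow from the public API looks roughly like the sketch below; the buffer type, graphs and tensors are assumed to be set up elsewhere.

    // Rough sketch of the graph-allocator flow these changes feed into.
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // measure a worst-case graph once, then allocate the actual graph;
    // with this change the graph leafs are measured and placed as well
    ggml_gallocr_reserve(galloc, worst_case_graph);   // worst_case_graph: assumed ggml_cgraph*
    ggml_gallocr_alloc_graph(galloc, graph);          // graph: assumed ggml_cgraph*

    // ... run the graph, write inputs with ggml_backend_tensor_set(), etc. ...
    ggml_gallocr_free(galloc);
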
|
||||
|
||||
178
ggml-cuda.cu
@@ -54,6 +54,8 @@
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
|
||||
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventRecord hipEventRecord
|
||||
@@ -526,6 +528,15 @@ typedef struct {
|
||||
} block_iq1_s;
|
||||
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
||||
|
||||
#define QK4_NL 32
|
||||
#define QR4_NL 2
|
||||
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK4_NL/2];
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
@@ -651,18 +662,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
||||
}
|
||||
return a;
|
||||
#else
|
||||
(void) a;
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
//#pragma unroll
|
||||
// for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
// a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
||||
// }
|
||||
// return a;
|
||||
//#else
|
||||
// (void) a;
|
||||
// NO_DEVICE_CODE;
|
||||
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
//}
|
||||
|
||||
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||
#pragma unroll
|
||||
@@ -672,18 +683,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||
}
|
||||
return x;
|
||||
#else
|
||||
(void) x;
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
}
|
||||
//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
//#pragma unroll
|
||||
// for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
// x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||
// }
|
||||
// return x;
|
||||
//#else
|
||||
// (void) x;
|
||||
// NO_DEVICE_CODE;
|
||||
//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
//}
|
||||
|
||||
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
@@ -1985,6 +1996,26 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
|
||||
}
|
||||
|
||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = (float)x[ib].d;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
@@ -4641,10 +4672,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
||||
#else
|
||||
(void) ksigns64;
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
#else
|
||||
(void) ksigns64;
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
@@ -4728,6 +4761,56 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||
#endif
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
|
||||
int & val1, int & val2) {
|
||||
|
||||
uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
|
||||
aux32 = q4 & 0x0f0f0f0f;
|
||||
uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
|
||||
uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
|
||||
val1 = v1 | (v2 << 16);
|
||||
aux32 = (q4 >> 4) & 0x0f0f0f0f;
|
||||
v1 = values[q8[0]] | (values[q8[1]] << 8);
|
||||
v2 = values[q8[2]] | (values[q8[3]] << 8);
|
||||
val2 = v1 | (v2 << 16);
|
||||
}
|
||||
#endif
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
|
||||
const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
|
||||
|
||||
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
||||
|
||||
int v1, v2;
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
||||
const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
|
||||
get_int_from_table_16(aux, values, v1, v2);
|
||||
sumi1 = __dp4a(v1, q8[l+0], sumi1);
|
||||
sumi2 = __dp4a(v2, q8[l+4], sumi2);
|
||||
}
|
||||
|
||||
#else
|
||||
const uint8_t * q4 = bq->qs + 4*iqs;
|
||||
const int8_t * q8 = bq8_1->qs + 4*iqs;
|
||||
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
||||
sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
|
||||
sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
|
||||
}
|
||||
#endif
|
||||
const float d = (float)bq->d * __low2float(bq8_1->ds);
|
||||
return d * (sumi1 + sumi2);
|
||||
}
|
||||
|
||||
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
||||
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
||||
static __device__ __forceinline__ void mul_mat_q(
|
||||
@@ -6205,7 +6288,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
|
||||
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + slope*pos[col];
|
||||
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
|
||||
|
||||
vals[col] = val;
|
||||
max_val = max(max_val, val);
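In the kernel's notation, the guarded expression above evaluates, per row i and column j (with the mask and positions treated as zero when the corresponding tensor is absent),

    $v_{ij} = s\,x_{ij} + m_{ij} + \alpha\,p_j, \qquad y_{ij} = \frac{e^{\,v_{ij} - \max_k v_{ik}}}{\sum_k e^{\,v_{ik} - \max_k v_{ik}}}$

where s is the scale, \alpha the per-head ALiBi slope, and p the position row; the new null check on pos simply makes the \alpha p_j term optional.
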
|
||||
@@ -6773,6 +6856,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c
|
||||
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
@@ -6814,6 +6903,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
default:
|
||||
@@ -6851,6 +6942,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
default:
|
||||
@@ -8595,6 +8688,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -8619,6 +8713,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
@@ -8720,6 +8815,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
||||
mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
@@ -9170,17 +9269,17 @@ static void ggml_cuda_op_soft_max(
|
||||
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
// positions tensor
|
||||
float * src2_dd = dst_dd; // default to avoid null checks in the kernel
|
||||
float * src2_dd = nullptr;
|
||||
cuda_pool_alloc<float> src2_f;
|
||||
|
||||
ggml_tensor * src2 = dst->src[2];
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
|
||||
if (use_src2) {
|
||||
const bool src2_on_device = use_src2 && src2->backend == GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * src2_extra = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
|
||||
const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
|
||||
|
||||
if (src2_on_device) {
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
src2_dd = (float *) src2_extra->data_device[g_main_device];
|
||||
} else {
|
||||
src2_dd = src2_f.alloc(ggml_nelements(src2));
|
||||
@@ -9323,9 +9422,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
if (can_access_peer) {
|
||||
if (enable_peer_access) {
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||
cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
|
||||
if (err != cudaErrorPeerAccessAlreadyEnabled) {
|
||||
CUDA_CHECK(err);
|
||||
}
|
||||
} else {
|
||||
CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
|
||||
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
|
||||
if (err != cudaErrorPeerAccessNotEnabled) {
|
||||
CUDA_CHECK(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10997,10 +11102,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
// unused at the moment
|
||||
//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
||||
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
||||
//}
|
||||
static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
||||
UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
||||
@@ -11388,7 +11493,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
if (node->src[j] != nullptr) {
|
||||
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
||||
assert(node->src[j]->extra != nullptr);
|
||||
}
|
||||
}
|
||||
@@ -11436,7 +11541,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) {
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
44
ggml-metal.m
@@ -62,6 +62,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_GROUP_NORM,
|
||||
@@ -85,6 +86,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
//GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
|
||||
@@ -104,6 +106,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
|
||||
@@ -120,6 +123,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
|
||||
@@ -136,6 +140,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
|
||||
@@ -277,6 +282,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
|
||||
NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
#else
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * sourcePath;
|
||||
@@ -299,6 +312,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
@@ -439,6 +453,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
|
||||
@@ -462,6 +477,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
|
||||
@@ -481,6 +497,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
@@ -497,6 +514,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
@@ -513,6 +531,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
@@ -1329,6 +1348,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
|
||||
@@ -1469,6 +1489,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
@@ -1516,6 +1542,11 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ4_NL) {
|
||||
const int mem_size = 32*sizeof(float);
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1610,6 +1641,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
}
|
||||
|
||||
@@ -1753,6 +1785,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
||||
@@ -1816,6 +1854,11 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ4_NL) {
|
||||
const int mem_size = 32*sizeof(float);
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1858,6 +1901,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
223
ggml-metal.metal
@@ -392,7 +392,7 @@ kernel void kernel_soft_max(
|
||||
float lmax = -INFINITY;
|
||||
|
||||
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
||||
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
@@ -417,7 +417,7 @@ kernel void kernel_soft_max(
|
||||
// parallel sum
|
||||
float lsum = 0.0f;
|
||||
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
||||
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
||||
lsum += exp_psrc0;
|
||||
pdst[i00] = exp_psrc0;
|
||||
}
|
||||
@@ -495,7 +495,7 @@ kernel void kernel_soft_max_4(
|
||||
float4 lmax4 = -INFINITY;
|
||||
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
|
||||
}
|
||||
|
||||
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
||||
@@ -521,7 +521,7 @@ kernel void kernel_soft_max_4(
|
||||
// parallel sum
|
||||
float4 lsum4 = 0.0f;
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
|
||||
lsum4 += exp_psrc4;
|
||||
pdst4[i00] = exp_psrc4;
|
||||
}
|
||||
@@ -2531,6 +2531,12 @@ typedef struct {
|
||||
uint8_t scales[QK_K/16];
|
||||
} block_iq1_s;
|
||||
|
||||
// Non-linear quants
|
||||
#define QK4_NL 32
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK4_NL/2];
|
||||
} block_iq4_nl;
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
@@ -4384,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
@@ -4447,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
}
|
||||
}
|
||||
|
||||
constexpr constant static float kvalues_iq4nl_f[16] = {
|
||||
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
||||
};
|
||||
|
||||
void kernel_mul_mv_iq4_nl_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup float * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK4_NL;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
const int first_row = (r0 * 2 + sgitg) * 2;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
const int ix = tiisg/2; // 0...15
|
||||
const int it = tiisg%2; // 0 or 1
|
||||
|
||||
shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
float4 yl[4];
|
||||
float sumf[2]={0.f}, all_sum;
|
||||
|
||||
device const float * yb = y + ix * QK4_NL + it * 8;
|
||||
|
||||
uint32_t aux32[2];
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)aux32;
|
||||
|
||||
float4 qf1, qf2;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += 16) {
|
||||
|
||||
device const float4 * y4 = (device const float4 *)yb;
|
||||
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
|
||||
device const block_iq4_nl & xb = x[row*nb + ib];
|
||||
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
|
||||
|
||||
float4 acc1 = {0.f}, acc2 = {0.f};
|
||||
|
||||
aux32[0] = q4[0] | (q4[1] << 16);
|
||||
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
||||
aux32[0] &= 0x0f0f0f0f;
|
||||
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
||||
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
||||
acc1 += yl[0] * qf1;
|
||||
acc2 += yl[1] * qf2;
|
||||
|
||||
aux32[0] = q4[2] | (q4[3] << 16);
|
||||
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
||||
aux32[0] &= 0x0f0f0f0f;
|
||||
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
||||
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
||||
acc1 += yl[2] * qf1;
|
||||
acc2 += yl[3] * qf2;
|
||||
|
||||
acc1 += acc2;
|
||||
|
||||
sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
|
||||
|
||||
}
|
||||
|
||||
yb += 16 * QK4_NL;
|
||||
}
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq1_s_f32(
|
||||
device const void * src0,
|
||||
@@ -4475,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32(
|
||||
kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
|
||||
kernel void kernel_mul_mv_iq4_nl_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup float * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
@@ -4838,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
||||
const float d = xb->d;
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows(
|
||||
device const void * src0,
|
||||
@@ -5381,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
@@ -5421,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@@ -5473,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
@@ -6503,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq4_nl_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup float * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq4_nl_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
261
ggml-quants.c
@@ -438,6 +438,30 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    return res;
}

// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    res[ 0] = a[b[ 0]];
    res[ 1] = a[b[ 1]];
    res[ 2] = a[b[ 2]];
    res[ 3] = a[b[ 3]];
    res[ 4] = a[b[ 4]];
    res[ 5] = a[b[ 5]];
    res[ 6] = a[b[ 6]];
    res[ 7] = a[b[ 7]];
    res[ 8] = a[b[ 8]];
    res[ 9] = a[b[ 9]];
    res[10] = a[b[10]];
    res[11] = a[b[11]];
    res[12] = a[b[12]];
    res[13] = a[b[13]];
    res[14] = a[b[14]];
    res[15] = a[b[15]];

    return res;
}

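// The scalar loop above is a portable fallback for the NEON byte table lookup
// vqtbl1q_s8 (every output byte is a[b[i]]), for builds where that intrinsic is
// not available; otherwise the #else branch below maps ggml_vqtbl1q_s8 straight
// to vqtbl1q_s8. A minimal usage sketch, assuming a signed 16-entry codebook
// `values` (int8x16_t) and packed nibble indices `idx` (uint8x16_t):
//
//     int8x16_t deq = ggml_vqtbl1q_s8(values, vandq_u8(idx, vdupq_n_u8(0x0f)));
//
// i.e. deq[i] == values[idx[i] & 0xf] for each of the 16 lanes.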
#else

#define ggml_int16x8x2_t int16x8x2_t
@@ -451,6 +475,7 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
#define ggml_vld1q_u8_x4 vld1q_u8_x4
#define ggml_vld1q_s8_x2 vld1q_s8_x2
#define ggml_vld1q_s8_x4 vld1q_s8_x4
#define ggml_vqtbl1q_s8 vqtbl1q_s8

#endif

@@ -3754,6 +3779,26 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
    }
}

static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
    assert(k % QK4_NL == 0);
    const int nb = k / QK4_NL;

    for (int i = 0; i < nb; i++) {

        const uint8_t * qs = x[i].qs;

        const float d = GGML_FP16_TO_FP32(x[i].d);
        for (int j = 0; j < QK4_NL/2; ++j) {
            y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf];
            y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4];
        }
        y += QK4_NL;
        qs += QK4_NL/2;
    }
}

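// Illustrative summary of the IQ4_NL layout (a sketch of what the code above
// implies): each block_iq4_nl covers QK4_NL = 32 weights with one fp16 scale d
// plus 16 bytes of packed nibbles, and every nibble indexes the non-uniform
// kvalues_iq4nl table instead of a linear 4-bit ramp. That is
// (2 + 16)*8/32 = 4.5 bits per weight, matching the "IQ4_NL - 4.5 bpw" label
// used for this type elsewhere in the change. Decoding one packed byte qs[j]
// yields two weights:
//
//     w_lo = d * kvalues_iq4nl[qs[j] & 0xf];   // first half of the block
//     w_hi = d * kvalues_iq4nl[qs[j] >> 4];    // second half of the block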
//===================================== Q8_K ==============================================

void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -9148,7 +9193,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
#endif
}

// TODO
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
@@ -9314,7 +9358,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
    uint16_t gindex[8];
    uint16x8x2_t vindex;
    int8x16x4_t q1b;
    int8x16x4_t q8b;
    ggml_int8x16x4_t q8b;
    uint16x8x4_t scales;
    int32x4x2_t sumi;
    int32x4x2_t dotq;
@@ -9452,7 +9496,100 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
    *s = sumf;

#endif
}

void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * restrict x = vx;
    const block_q8_0 * restrict y = vy;

    const int nb = n / QK4_NL;

#if defined __ARM_NEON
    const int8x16_t values = vld1q_s8(kvalues_iq4nl);
    const uint8x16_t m4b = vdupq_n_u8(0x0f);
    uint8x16x2_t q4bits;
    int8x16x4_t q4b;
    int8x16x4_t q8b;
    int32x4_t prod_1, prod_2;

    float sumf = 0;

    for (int ib = 0; ib < nb; ib += 2) {

        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0] = vld1q_s8(y[ib+0].qs);
        q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
        q8b.val[2] = vld1q_s8(y[ib+1].qs);
        q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);

        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);

    }

    *s = sumf;

#elif defined __AVX2__

    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
    const __m128i m4b = _mm_set1_epi8(0x0f);
    const __m256i mone = _mm256_set1_epi16(1);

    __m256 accum1 = _mm256_setzero_ps();
    __m256 accum2 = _mm256_setzero_ps();
    for (int ib = 0; ib < nb; ib += 2) {
        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
        const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
        const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
        const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
        const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
        const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
        accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
                                 _mm256_cvtepi32_ps(p_1), accum1);
        accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
                                 _mm256_cvtepi32_ps(p_2), accum2);

        y += 2;
        x += 2;
    }

    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

#else
    float sumf = 0;
    for (int ib = 0; ib < nb; ++ib) {
        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
            sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
        }
        sumf += d * (sumi1 + sumi2);
    }
    *s = sumf;
#endif
}

// ================================ IQ2 quantization =============================================
@@ -10729,3 +10866,123 @@ size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, in
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq1_s);
|
||||
}
|
||||
|
||||
// ============================ 4-bit non-linear quants
|
||||
|
||||
static inline int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) return 0;
|
||||
if (x >= val[n-1]) return n-1;
|
||||
int ml = 0, mu = n-1;
|
||||
while (mu-ml > 1) {
|
||||
int mav = (ml+mu)/2;
|
||||
if (x < val[mav]) mu = mav; else ml = mav;
|
||||
}
|
||||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
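// Illustrative example: best_index_int8 above does a binary search over the
// sorted codebook and returns the index of the entry nearest to x. With the
// kvalues_iq4nl table, for instance:
//
//     int idx = best_index_int8(16, kvalues_iq4nl, 0.0f);   // -> 8, since 1 is
//                                                           //    closer to 0 than -10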
static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
|
||||
ggml_fp16_t * dh, uint8_t * q4,
|
||||
float * weight, uint8_t * L,
|
||||
const int8_t * values,
|
||||
const float * quant_weights) {
|
||||
|
||||
const int ntry = 7;
|
||||
|
||||
float sigma2 = 0;
|
||||
for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j];
|
||||
sigma2 *= 2.f/QK4_NL;
|
||||
|
||||
const int nb = QK4_NL/block_size;
|
||||
|
||||
memset(q4, 0, QK4_NL/2);
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
dh[ib] = GGML_FP32_TO_FP16(0.f);
|
||||
const float * xb = x + ib*block_size;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + ib*block_size;
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
||||
} else {
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
|
||||
}
|
||||
float amax = 0, max = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float ax = fabsf(xb[j]);
|
||||
if (ax > amax) {
|
||||
amax = ax; max = xb[j];
|
||||
}
|
||||
}
|
||||
if (!amax) {
|
||||
continue;
|
||||
}
|
||||
float d = -max/values[0];
|
||||
float id = 1/d;
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float al = id*xb[j];
|
||||
int l = best_index_int8(16, values, al);
|
||||
float q = values[l];
|
||||
float w = weight[j];
|
||||
sumqx += w*q*xb[j];
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
float best_id = id;
|
||||
d = sumqx/sumq2;
|
||||
float best = d*sumqx;
|
||||
for (int itry = -ntry; itry <= ntry; ++itry) {
|
||||
id = (itry + values[0])/max;
|
||||
sumqx = sumq2 = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float al = id*xb[j];
|
||||
int l = best_index_int8(16, values, al);
|
||||
float q = values[l];
|
||||
float w = weight[j];
|
||||
sumqx += w*q*xb[j];
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
||||
d = sumqx/sumq2; best = d * sumqx;
|
||||
best_id = id;
|
||||
}
|
||||
}
|
||||
dh[ib] = GGML_FP32_TO_FP16(d);
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < QK4_NL/32; ++i) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
||||
int nblock = n_per_row/QK4_NL;
|
||||
char * qrow = (char *)dst;
|
||||
uint8_t L[QK4_NL];
|
||||
float weight[32];
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
||||
quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw);
|
||||
}
|
||||
src += n_per_row;
|
||||
qrow += nblock*sizeof(block_iq4_nl);
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq4_nl);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
block_iq4_nl * restrict y = vy;
|
||||
quantize_row_iq4_nl_reference(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
@@ -198,6 +198,14 @@ typedef struct {
|
||||
} block_iq1_s;
|
||||
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
||||
|
||||
// Non-linear quants
|
||||
#define QK4_NL 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK4_NL/2];
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -217,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
@@ -232,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
@@ -251,6 +261,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
@@ -268,6 +279,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
@@ -276,6 +288,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row,
|
||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
284
ggml-sycl.cpp
@@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
|
||||
vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
|
||||
vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
|
||||
vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
|
||||
vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
|
||||
vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
|
||||
vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
|
||||
vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
|
||||
vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
|
||||
vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
|
||||
vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
|
||||
item_ct1);
|
||||
});
|
||||
template <int qk, int qi, typename block_q_t, int vdr,
|
||||
vec_dot_q_sycl_t vec_dot_q_sycl>
|
||||
static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
|
||||
](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
|
||||
mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
int get_device_index_by_id(int id){
|
||||
@@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
// TODO: support these quantization types
|
||||
GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
|
||||
src0->type == GGML_TYPE_IQ2_XS ||
|
||||
src0->type == GGML_TYPE_IQ3_XXS ||
|
||||
src0->type == GGML_TYPE_IQ1_S));
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
|
||||
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
|
||||
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
|
||||
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
|
||||
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
|
||||
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
|
||||
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
|
||||
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
|
||||
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
|
||||
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
|
||||
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
|
||||
src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
@@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
|
||||
const int64_t src1_ncols, const int64_t src1_padded_row_size,
|
||||
const dpct::queue_ptr &stream) {
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
@@ -14768,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
|
||||
static ggml_backend_buffer_t
|
||||
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
size_t size) try {
|
||||
int device = (int) (intptr_t) buft->context;
|
||||
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
||||
int device = (int) buft_ctx->device;
|
||||
|
||||
ggml_sycl_set_device(device);
|
||||
int device_index = get_device_index_by_id(device);
|
||||
@@ -14846,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
|
||||
};
|
||||
}
|
||||
ggml_backend_sycl_buffer_type_initialized = true;
|
||||
@@ -14908,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
||||
|
||||
// backend
|
||||
|
||||
struct ggml_backend_context_sycl {
|
||||
int device;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
||||
return GGML_SYCL_NAME;
|
||||
|
||||
@@ -14919,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
delete sycl_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
||||
}
|
||||
@@ -14935,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
@@ -14953,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
@@ -14968,7 +14839,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
|
||||
|
||||
@@ -15004,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||
|
||||
@@ -15093,6 +14964,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
|
||||
return false;
|
||||
}
|
||||
|
||||
if (a->type == GGML_TYPE_IQ1_S) {
|
||||
return false;
|
||||
}
|
||||
if (a->type == GGML_TYPE_IQ3_XXS) {
|
||||
return false;
|
||||
}
|
||||
if (a->type == GGML_TYPE_IQ2_XXS) {
|
||||
return false;
|
||||
}
|
||||
@@ -15212,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
||||
ggml_sycl_set_main_device(device);
|
||||
|
||||
ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
|
||||
/* .device = */ device
|
||||
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
|
||||
/* .device = */ device,
|
||||
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
|
||||
};
|
||||
|
||||
ggml_backend_t sycl_backend = new ggml_backend {
|
||||
|
||||
103
ggml-vulkan.cpp
@@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_instance_init() {
|
||||
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
||||
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
||||
|
||||
void ggml_vk_instance_init() {
|
||||
if (vk_instance_initialized) {
|
||||
return;
|
||||
}
|
||||
@@ -1100,28 +1103,42 @@ static void ggml_vk_instance_init() {
|
||||
#endif
|
||||
|
||||
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
|
||||
const std::vector<const char*> layers = {
|
||||
#ifdef GGML_VULKAN_VALIDATE
|
||||
"VK_LAYER_KHRONOS_validation",
|
||||
#endif
|
||||
};
|
||||
const std::vector<const char*> extensions = {
|
||||
#ifdef GGML_VULKAN_VALIDATE
|
||||
"VK_EXT_validation_features",
|
||||
#endif
|
||||
};
|
||||
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
|
||||
#ifdef GGML_VULKAN_VALIDATE
|
||||
const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
||||
vk::ValidationFeaturesEXT validation_features = {
|
||||
features_enable,
|
||||
{},
|
||||
};
|
||||
validation_features.setPNext(nullptr);
|
||||
instance_create_info.setPNext(&validation_features);
|
||||
|
||||
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
#endif
|
||||
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
|
||||
const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
|
||||
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
|
||||
|
||||
std::vector<const char*> layers;
|
||||
|
||||
if (validation_ext) {
|
||||
layers.push_back("VK_LAYER_KHRONOS_validation");
|
||||
}
|
||||
std::vector<const char*> extensions;
|
||||
if (validation_ext) {
|
||||
extensions.push_back("VK_EXT_validation_features");
|
||||
}
|
||||
if (portability_enumeration_ext) {
|
||||
extensions.push_back("VK_KHR_portability_enumeration");
|
||||
}
|
||||
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
|
||||
if (portability_enumeration_ext) {
|
||||
instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
|
||||
}
|
||||
|
||||
std::vector<vk::ValidationFeatureEnableEXT> features_enable;
|
||||
vk::ValidationFeaturesEXT validation_features;
|
||||
|
||||
if (validation_ext) {
|
||||
features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
|
||||
validation_features = {
|
||||
features_enable,
|
||||
{},
|
||||
};
|
||||
validation_features.setPNext(nullptr);
|
||||
instance_create_info.setPNext(&validation_features);
|
||||
|
||||
std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||
}
|
||||
vk_instance.instance = vk::createInstance(instance_create_info);
|
||||
|
||||
memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
|
||||
@@ -1168,12 +1185,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||
vk_instance.devices[idx] = std::make_shared<vk_device>();
|
||||
ctx->device = vk_instance.devices[idx];
|
||||
ctx->device.lock()->physical_device = devices[dev_num];
|
||||
std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
||||
const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool maintenance4_support = false;
|
||||
|
||||
// Check if maintenance4 is supported
|
||||
for (auto properties : ext_props) {
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
|
||||
maintenance4_support = true;
|
||||
}
|
||||
@@ -1204,7 +1221,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
|
||||
for (auto properties : ext_props) {
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
|
||||
fp16_storage = true;
|
||||
} else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
|
||||
@@ -5301,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
||||
return vk_instance.device_indices.size();
|
||||
}
|
||||
|
||||
// Extension availability
|
||||
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
||||
#ifdef GGML_VULKAN_VALIDATE
|
||||
bool portability_enumeration_ext = false;
|
||||
// Check for portability enumeration extension for MoltenVK support
|
||||
for (const auto& properties : instance_extensions) {
|
||||
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (!portability_enumeration_ext) {
|
||||
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
|
||||
UNUSED(instance_extensions);
|
||||
}
|
||||
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
|
||||
#ifdef __APPLE__
|
||||
bool portability_enumeration_ext = false;
|
||||
// Check for portability enumeration extension for MoltenVK support
|
||||
for (const auto& properties : instance_extensions) {
|
||||
if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (!portability_enumeration_ext) {
|
||||
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
|
||||
UNUSED(instance_extensions);
|
||||
}
|
||||
|
||||
// checks
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
|
||||
2
ggml.h
@@ -355,6 +355,7 @@ extern "C" {
        GGML_TYPE_IQ2_XS = 17,
        GGML_TYPE_IQ3_XXS = 18,
        GGML_TYPE_IQ1_S = 19,
        GGML_TYPE_IQ4_NL = 20,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
@@ -393,6 +394,7 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
    };

    // available tensor operations:

@@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum):
    ORION = auto()
    INTERNLM2 = auto()
    MINICPM = auto()
    GEMMA = auto()


class MODEL_TENSOR(IntEnum):
@@ -167,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.ORION: "orion",
    MODEL_ARCH.INTERNLM2: "internlm2",
    MODEL_ARCH.MINICPM: "minicpm",
    MODEL_ARCH.GEMMA: "gemma",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -511,6 +513,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.GEMMA: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_NORM,
    ],
    # TODO
}

407
llama.cpp
@@ -208,6 +208,7 @@ enum llm_arch {
|
||||
LLM_ARCH_ORION,
|
||||
LLM_ARCH_INTERNLM2,
|
||||
LLM_ARCH_MINICPM,
|
||||
LLM_ARCH_GEMMA,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -234,6 +235,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
{ LLM_ARCH_MINICPM, "minicpm" },
|
||||
{ LLM_ARCH_GEMMA, "gemma" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
@@ -760,6 +762,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GEMMA,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2527,6 +2545,7 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
||||
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
||||
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
||||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
@@ -2772,13 +2791,7 @@ struct llama_model_loader {
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
|
||||
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
||||
if (!cur) {
|
||||
// some tensors may be allocated in a different context
|
||||
continue;
|
||||
}
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
if (progress_callback) {
|
||||
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
||||
return false;
|
||||
@@ -2877,6 +2890,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
@@ -3241,6 +3255,16 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: model.type = e_model::MODEL_2B; break;
|
||||
case 28: model.type = e_model::MODEL_7B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -3692,7 +3716,7 @@ static bool llm_load_tensors(
|
||||
}
|
||||
|
||||
// create one context per buffer type
|
||||
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
||||
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
struct ggml_init_params params = {
|
||||
@@ -3830,6 +3854,7 @@ static bool llm_load_tensors(
|
||||
} else {
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4029,6 +4054,8 @@ static bool llm_load_tensors(
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
||||
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
}
|
||||
|
||||
@@ -4038,14 +4065,23 @@ static bool llm_load_tensors(
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
||||
|
||||
// AWQ ScaleActivation layer
|
||||
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
||||
@@ -4358,6 +4394,40 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
|
||||
const int64_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -6112,7 +6182,7 @@ struct llm_build_context {
|
||||
|
||||
attn_norm = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
NULL,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(attn_norm, "attn_norm", il);
|
||||
|
||||
@@ -6123,6 +6193,11 @@ struct llm_build_context {
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
if (model.layers[il].bqkv){
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(cur, "wqkv_clamped", il);
|
||||
@@ -6139,7 +6214,7 @@ struct llm_build_context {
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
@@ -6152,13 +6227,13 @@ struct llm_build_context {
|
||||
{
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
NULL,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
model.layers[il].ffn_act,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
@@ -6175,7 +6250,7 @@ struct llm_build_context {
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm,
|
||||
NULL,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
@@ -7364,6 +7439,113 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gemma() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
||||
cb(inpL, "inp_scaled", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
||||
cb(Qcur, "Qcur_scaled", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
||||
cb(sa_out, "sa_out", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, sa_out, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
@@ -7472,6 +7654,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_minicpm();
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
{
|
||||
result = llm.build_gemma();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -10354,6 +10540,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;

@@ -10406,6 +10595,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;

@@ -10422,7 +10614,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
new_type = GGML_TYPE_Q5_K;
}

@@ -10489,8 +10681,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;

@@ -10531,7 +10723,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break;
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;

default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}

@@ -11995,18 +12188,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
data_ctx->write(&kv_used, sizeof(kv_used));

if (kv_buf_size) {
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

std::vector<uint8_t> tmp_buf;
for (int il = 0; il < (int) n_layer; ++il) {
tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
tmp_buf.resize(k_size);
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());

// v is not contiguous, copy row by row
tmp_buf.resize(elt_size*kv_head);
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
tmp_buf.resize(v_row_size);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());
}
}
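
The change above from ggml_element_size() to ggml_row_size() is what lets the session save/load path handle a quantized KV cache: for block-quantized types the byte size of a row is computed per block, not per element. A minimal sketch of that arithmetic, assuming only the ggml.h helpers named here (the sizes are made-up examples, not values from the code):

#include <cstdio>
#include "ggml.h"

int main(void) {
    // stand-in for n_embd_k_gqa * kv_head in the code above (made-up value)
    const int64_t n = 4096;
    // for F16 the row is simply 2 bytes per element
    printf("f16  row bytes: %zu\n", ggml_row_size(GGML_TYPE_F16,  n));
    // for Q8_0 the row is stored as n/32 blocks, so ggml_element_size()*n
    // would no longer give the right byte count
    printf("q8_0 row bytes: %zu\n", ggml_row_size(GGML_TYPE_Q8_0, n));
    return 0;
}

The same reasoning explains v_row_stride: the offset between consecutive V rows is the row size at the full context length n_ctx, which is why the read offset changes from ir*elt_size*n_ctx to ir*v_row_stride in the hunk above.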

@@ -12108,17 +12302,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
if (kv_buf_size) {
GGML_ASSERT(kv_self.total_size() == kv_buf_size);

const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

for (int il = 0; il < (int) n_layer; ++il) {
size_t k_size = elt_size*n_embd_k_gqa*kv_head;
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
inp += k_size;

// v is not contiguous, copy row by row
size_t v_row_size = elt_size*kv_head;
size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
inp += v_row_size;
}
}

@@ -12508,6 +12701,154 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
return 0;
}

// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
size_t start = 0;
size_t end = str.size();
while (start < end && isspace(str[start])) {
start += 1;
}
while (end > start && isspace(str[end - 1])) {
end -= 1;
}
return str.substr(start, end - start);
}

// Simple version of "llama_apply_chat_template" that only works with strings
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
static int32_t llama_chat_apply_template_internal(
const std::string & tmpl,
const std::vector<const llama_chat_message *> & chat,
std::string & dest, bool add_ass) {
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
std::stringstream ss;
if (tmpl.find("<|im_start|>") != std::string::npos) {
// chatml template
for (auto message : chat) {
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
}
if (add_ass) {
ss << "<|im_start|>assistant\n";
}
} else if (tmpl.find("[INST]") != std::string::npos) {
// llama2 template and its variants
// [variant] support system message
bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
// [variant] space before + after response
bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
// [variant] add BOS inside history
bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
// [variant] trim spaces from the input message
bool strip_message = tmpl.find("content.strip()") != std::string::npos;
// construct the prompt
bool is_inside_turn = true; // skip BOS at the beginning
ss << "[INST] ";
for (auto message : chat) {
std::string content = strip_message ? trim(message->content) : message->content;
std::string role(message->role);
if (!is_inside_turn) {
is_inside_turn = true;
ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
}
if (role == "system") {
if (support_system_message) {
ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
} else {
// if the model does not support system message, we still include it in the first message, but without <<SYS>>
ss << content << "\n";
}
} else if (role == "user") {
ss << content << " [/INST]";
} else {
ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
is_inside_turn = false;
}
}
// llama2 templates seem to not care about "add_generation_prompt"
} else if (tmpl.find("<|user|>") != std::string::npos) {
// zephyr template
for (auto message : chat) {
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
}
if (add_ass) {
ss << "<|assistant|>\n";
}
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
for (auto message : chat) {
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
ss << bos << message->role << "\n" << message->content << "</s>\n";
}
if (add_ass) {
ss << "<s>assistant\n";
}
} else if (tmpl.find("<start_of_turn>") != std::string::npos) {
// google/gemma-7b-it
std::string system_prompt = "";
for (auto message : chat) {
std::string role(message->role);
if (role == "system") {
// there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
system_prompt = trim(message->content);
continue;
}
// in gemma, "assistant" is "model"
role = role == "assistant" ? "model" : message->role;
ss << "<start_of_turn>" << role << "\n";
if (!system_prompt.empty() && role != "model") {
ss << system_prompt << "\n\n";
system_prompt = "";
}
ss << trim(message->content) << "<end_of_turn>\n";
}
if (add_ass) {
ss << "<start_of_turn>model\n";
}
} else {
// template not supported
return -1;
}
dest = ss.str();
return dest.size();
}
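
As a worked illustration of the ChatML branch above, here is a standalone sketch that builds the same string shape; the two messages are invented examples, not taken from the code:

#include <sstream>
#include <string>
#include <vector>

int main() {
    struct msg { std::string role, content; };
    // made-up conversation, formatted the same way the "<|im_start|>" branch does
    std::vector<msg> chat = { {"user", "Hello"}, {"assistant", "Hi there"} };
    std::stringstream ss;
    for (const auto & m : chat) {
        ss << "<|im_start|>" << m.role << "\n" << m.content << "<|im_end|>\n";
    }
    ss << "<|im_start|>assistant\n"; // what add_ass == true appends
    // ss.str() is now:
    // <|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>assistant\n
    return 0;
}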

LLAMA_API int32_t llama_chat_apply_template(
const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length) {
std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
if (tmpl == nullptr) {
GGML_ASSERT(model != nullptr);
// load template from model
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
std::string template_key = "tokenizer.chat_template";
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
if (res < 0) {
// worst case: there is no information about template, we will use chatml by default
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
} else {
curr_tmpl = std::string(model_template.data(), model_template.size());
}
}
// format the chat to string
std::vector<const llama_chat_message *> chat_vec;
chat_vec.resize(n_msg);
for (size_t i = 0; i < n_msg; i++) {
chat_vec[i] = &chat[i];
}
std::string formatted_chat;
int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
if (res < 0) {
return res;
}
strncpy(buf, formatted_chat.c_str(), length);
return res;
}

struct llama_timings llama_get_timings(struct llama_context * ctx) {
struct llama_timings result = {
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,

26
llama.h

@@ -101,6 +101,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors

LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};

@@ -305,6 +306,12 @@ extern "C" {
int32_t n_eval;
};

// used in chat template
typedef struct llama_chat_message {
const char * role;
const char * content;
} llama_chat_message;

// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);

@@ -699,6 +706,25 @@ extern "C" {
char * buf,
int32_t length);

/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model".
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
/// @param length The size of the allocated buffer
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
LLAMA_API int32_t llama_chat_apply_template(
const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length);
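
Given the buffer and return-value semantics documented above, callers are expected to retry with a larger buffer when the formatted prompt does not fit. A usage sketch under those assumptions; the conversation and the loaded model pointer are placeholders, not part of the header:

#include <cstring>
#include <string>
#include <vector>
#include "llama.h"

std::string format_chat(const struct llama_model * model) {
    // placeholder conversation
    std::vector<llama_chat_message> msgs = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };
    size_t alloc = 0;
    for (const auto & m : msgs) {
        alloc += strlen(m.content);
    }
    // recommended starting size: 2x the total number of message characters
    std::vector<char> buf(2 * alloc);
    int32_t res = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(), true, buf.data(), buf.size());
    if (res > (int32_t) buf.size()) {
        // output was truncated: grow the buffer and apply the template again
        buf.resize(res);
        res = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(), true, buf.data(), buf.size());
    }
    if (res < 0) {
        return ""; // template not supported
    }
    return std::string(buf.data(), res);
}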

//
// Grammar
//

@@ -1,6 +1,6 @@
ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
GF_CC_IS_GCC = 1
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null; echo; $(GF_CC) -dumpversion; } | awk -F. '/./ { printf("%02d%02d%02d", $$1, $$2, $$3); exit }')
else
GF_CC_IS_CLANG = 1
ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'

@@ -1 +1 @@
5070f078a67c18c11736e78316ab715ca9afde16
8cdf783f288a98eddf521b0ab1b4d405be9e18ba

@@ -28,6 +28,7 @@ endfunction()
llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp)
llama_build_and_test_executable(test-chat-template.cpp)

llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)

@@ -1918,6 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
GGML_TYPE_Q6_K,
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
GGML_TYPE_IQ4_NL,
};

// unary ops

75
tests/test-chat-template.cpp
Normal file
@@ -0,0 +1,75 @@
#include <iostream>
#include <string>
#include <vector>
#include <sstream>

#undef NDEBUG
#include <cassert>

#include "llama.h"

int main(void) {
llama_chat_message conversation[] = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "Who are you"},
{"assistant", " I am an assistant "},
{"user", "Another question"},
};
size_t message_count = 6;
std::vector<std::string> templates = {
// teknium/OpenHermes-2.5-Mistral-7B
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
// mistralai/Mistral-7B-Instruct-v0.2
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
// bofenghuang/vigogne-2-70b-chat
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
// mlabonne/AlphaMonarch-7B
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
// google/gemma-7b-it
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
// mistralai/Mistral-7B-Instruct-v0.2
"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
// bofenghuang/vigogne-2-70b-chat
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// mlabonne/AlphaMonarch-7B
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
// google/gemma-7b-it
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
};
std::vector<char> formatted_chat(1024);
int32_t res;

// test invalid chat template
res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
assert(res < 0);

for (size_t i = 0; i < templates.size(); i++) {
std::string custom_template = templates[i];
std::string expected = expected_output[i];
formatted_chat.resize(1024);
res = llama_chat_apply_template(
nullptr,
custom_template.c_str(),
conversation,
message_count,
true,
formatted_chat.data(),
formatted_chat.size()
);
formatted_chat.resize(res);
std::string output(formatted_chat.data(), formatted_chat.size());
std::cout << output << "\n-------------------------\n";
assert(output == expected);
}
return 0;
}