Compare commits

...

12 Commits
b5629 ... b5641

Author SHA1 Message Date
compilade
a20b2b05bc context : round n_tokens to next multiple of n_seqs when reserving (#14140)
This fixes RWKV inference, which otherwise failed
when the worst-case ubatch.n_seq_tokens rounded to 0.
2025-06-12 02:56:04 -04:00
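For reference, this is the usual "round up to the next multiple" idiom; a minimal standalone sketch (illustrative only, not the actual patch, names are made up):

#include <cassert>
#include <cstdint>

// Round n up to the next multiple of m (m > 0).
static uint32_t round_up_to_multiple(uint32_t n, uint32_t m) {
    return ((n + m - 1) / m) * m;
}

int main() {
    assert(round_up_to_multiple(7, 4) == 8); // 7 tokens, 4 seqs -> reserve 8
    assert(round_up_to_multiple(8, 4) == 8); // already a multiple, unchanged
    assert(round_up_to_multiple(1, 3) == 3); // guarantees at least 1 token per sequence
    return 0;
}

With the reserved n_tokens rounded up like this, the per-sequence token count can no longer truncate to 0 in the worst-case reservation.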
bandoti
2e89f76b7a common: fix issue with regex_escape routine on windows (#14133) 2025-06-11 17:19:44 -03:00
Christian Kastner
532802f938 Implement GGML_CPU_ALL_VARIANTS for ARM (#14080)
* ggml-cpu: Factor out feature detection build from x86

* ggml-cpu: Add ARM feature detection and scoring

This is analogous to cpu-feats-x86.cpp. However, to detect compile-time
activation of features, we rely on GGML_USE_<FEAT>, which needs to be set
in cmake, instead of GGML_<FEAT>, which users would set for x86.

This is because on ARM, users specify features with GGML_CPU_ARM_ARCH,
rather than with individual flags.

* ggml-cpu: Implement GGML_CPU_ALL_VARIANTS for ARM

This works like x86; however, to pass arch flags around within cmake we use
GGML_INTERNAL_<FEAT>, as we don't have GGML_<FEAT>.

Some features are optional, so we may need to build multiple backends
per arch version (armv8.2_1, armv8.2_2, ...), and let the scoring
function sort out which one can be used.

* ggml-cpu: Limit ARM GGML_CPU_ALL_VARIANTS to Linux for now

The other platforms will need their own specific variants.

This also fixes a bug where the variant-building branch was always
being executed as the else-branch of GGML_NATIVE=OFF. The branch is
moved to an elseif-branch, which restores the previous behavior.
2025-06-11 21:07:44 +02:00
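A minimal sketch of the variant-selection idea described in this commit (illustrative only, not the actual ggml loader): every built backend variant reports a score that is 0 when a required feature is missing on the running CPU, and the highest-scoring candidate wins.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical (name, score) pairs, as a per-variant scoring function such as
// ggml_backend_cpu_aarch64_score() would report them on a given CPU.
struct variant { std::string name; int score; };

static std::string pick_best(const std::vector<variant> & built) {
    std::string best_name = "none";
    int best_score = 0;
    for (const auto & v : built) {
        if (v.score > best_score) { // a score of 0 means "cannot run here"
            best_score = v.score;
            best_name  = v.name;
        }
    }
    return best_name;
}

int main() {
    // e.g. a CPU with dotprod and fp16 but no SVE/i8mm/SME:
    std::vector<variant> built = {
        {"armv8.0_1", 1}, {"armv8.2_1", 3}, {"armv8.2_2", 7},
        {"armv8.2_3", 0}, {"armv8.6_1", 0},
    };
    std::cout << pick_best(built) << "\n"; // prints armv8.2_2
    return 0;
}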
Sigbjørn Skjæret
d4e0d95cf5 chore : clean up relative source dir paths (#14128) 2025-06-11 19:04:23 +02:00
Sigbjørn Skjæret
cc66a7f78f tests : add test-tokenizers-repo (#14017) 2025-06-11 17:16:32 +02:00
Jeff Bolz
bd248d4dc7 vulkan: Better thread-safety for command pools/buffers (#14116)
This change moves the command pool/buffer tracking into a vk_command_pool
structure. There are two instances per context (for compute+transfer) and
two instances per device for operations that don't go through a context.
This should prevent separate contexts from stomping on each other.
2025-06-11 09:48:52 -05:00
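The ownership change can be pictured with a simplified sketch (hypothetical names, plain C++ stand-ins instead of Vulkan handles): command buffers live in a pool object, and each context carries its own compute and transfer pools instead of sharing the queue's.

#include <cstdint>
#include <vector>

using CmdBuffer = uint64_t;        // stand-in for vk::CommandBuffer, illustration only

struct CommandPool {               // roughly vk_command_pool
    std::vector<CmdBuffer> cmd_buffers;
    uint32_t cmd_buffer_idx = 0;   // next buffer to hand out / reuse
};

struct Context {                   // roughly ggml_backend_vk_context
    CommandPool compute_cmd_pool;  // private to this context
    CommandPool transfer_cmd_pool; // private to this context
};

int main() {
    Context a, b; // two contexts record into disjoint pools, so no cross-context stomping
    a.compute_cmd_pool.cmd_buffers.push_back(1);
    b.compute_cmd_pool.cmd_buffers.push_back(2);
    return 0;
}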
Aman
7781e5fe99 webui: Wrap long numbers instead of infinite horizontal scroll (#14062)
* webui: Wrap long numbers instead of infinite horizontal scroll

* Use tailwind class

* update index.html.gz
2025-06-11 16:42:25 +02:00
Georgi Gerganov
89a184fa71 kv-cache : relax SWA masking condition (#14119)
ggml-ci
2025-06-11 16:48:45 +03:00
Taylor
2baf07727f server : pass default --keep argument (#14120) 2025-06-11 13:43:43 +03:00
Georgi Gerganov
7ae2932116 kv-cache : add LLAMA_KV_CACHE_DEBUG environment variable (#14121) 2025-06-11 12:52:45 +03:00
Jeff Bolz
1f7d50b293 vulkan: Track descriptor pools/sets per-context (#14109)
Use the same descriptor set layout for all pipelines (MAX_PARAMETER_COUNT == 8)
and move it to the vk_device. Move all the descriptor pool and set tracking to
the context - none of it is specific to pipelines anymore. The context now
holds a single vector of pools, a single vector of sets, one counter to track
requests, and one counter to track use.
2025-06-11 07:19:25 +02:00
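The request/use counters described above boil down to chunked allocation from fixed-size pools; a standalone sketch of just that arithmetic (assuming VK_DEVICE_DESCRIPTOR_POOL_SIZE = 256 as in the diff below, other names illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr uint32_t POOL_SIZE = 256; // ~ VK_DEVICE_DESCRIPTOR_POOL_SIZE

// Given how many sets already exist and how many are required in total,
// print how many sets must come out of each pool, mirroring the loop in
// ggml_pipeline_allocate_descriptor_sets().
static void plan_allocation(uint32_t have, uint32_t required) {
    if (have >= required) return;                        // enough sets already
    uint32_t to_alloc       = required - have;
    uint32_t pool_remaining = POOL_SIZE - have % POOL_SIZE;
    uint32_t pool_idx       = have / POOL_SIZE;
    while (to_alloc > 0) {
        const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
        std::printf("pool %u: allocate %u sets\n", pool_idx, alloc_count);
        to_alloc      -= alloc_count;
        pool_remaining = POOL_SIZE;
        pool_idx++;
    }
}

int main() {
    plan_allocation(200, 600); // pool 0: 56 sets, pool 1: 256 sets, pool 2: 88 sets
    return 0;
}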
lhez
4c763c8d1b opencl: add mul_mv_id_q4_0_f32_8x_flat (#14003) 2025-06-10 16:55:58 -07:00
17 changed files with 1060 additions and 300 deletions

View File

@@ -7,8 +7,8 @@ llama_add_compile_flags()
# Build info header
#
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,7 +18,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
if (SLASH_POS EQUAL 0)
set(GIT_DIR "${REAL_GIT_DIR}")
else()
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
endif()
endif()
@@ -42,7 +42,7 @@ add_custom_command(
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
)

View File

@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
std::string regex_escape(const std::string & s) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
return std::regex_replace(s, special_chars, "\\$0");
return std::regex_replace(s, special_chars, "\\$&");
}
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
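A minimal check of the fix above (illustrative): in std::regex_replace's default ECMAScript format, "$&" inserts the entire match, while "$0" is not a guaranteed backreference, which is why the old replacement misbehaved on some runtimes.

#include <cassert>
#include <regex>
#include <string>

// Same escaping approach as the regex_escape() shown above: prefix every
// regex metacharacter with a backslash, using "$&" (the whole match) in the
// replacement string.
static std::string regex_escape_sketch(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    return std::regex_replace(s, special_chars, "\\$&");
}

int main() {
    assert(regex_escape_sketch("a.b*c") == "a\\.b\\*c");
    assert(std::regex_match("a.b*c", std::regex(regex_escape_sketch("a.b*c"))));
    return 0;
}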

View File

@@ -270,17 +270,23 @@ endfunction()
function(ggml_add_cpu_backend_variant tag_name)
set(GGML_CPU_TAG_NAME ${tag_name})
# other: OPENMP LLAMAFILE CPU_HBM
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
if (GGML_SYSTEM_ARCH STREQUAL "x86")
foreach (feat NATIVE
SSE42
AVX AVX2 BMI2 AVX_VNNI FMA F16C
AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
AMX_TILE AMX_INT8 AMX_BF16)
set(GGML_${feat} OFF)
endforeach()
foreach (feat ${ARGN})
set(GGML_${feat} ON)
endforeach()
foreach (feat ${ARGN})
set(GGML_${feat} ON)
endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
foreach (feat ${ARGN})
set(GGML_INTERNAL_${feat} ON)
endforeach()
endif()
ggml_add_cpu_backend_variant_impl(${tag_name})
endfunction()
@@ -290,6 +296,8 @@ ggml_add_backend(CPU)
if (GGML_CPU_ALL_VARIANTS)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
elseif (GGML_CPU_ARM_ARCH)
message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
endif()
if (GGML_SYSTEM_ARCH STREQUAL "x86")
ggml_add_cpu_backend_variant(x64)
@@ -303,8 +311,20 @@ if (GGML_CPU_ALL_VARIANTS)
# MSVC doesn't support AMX
ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
endif()
elseif(GGML_SYSTEM_ARCH STREQUAL "ARM" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
# Many of these features are optional so we build versions with popular
# combinations and name the backends based on the version they were
# first released with
ggml_add_cpu_backend_variant(armv8.0_1)
ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
else()
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
endif()
elseif (GGML_CPU)
ggml_add_cpu_backend_variant_impl("")

View File

@@ -1,3 +1,17 @@
function(ggml_add_cpu_backend_features cpu_name arch)
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
endfunction()
function(ggml_add_cpu_backend_variant_impl tag_name)
if (tag_name)
set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -143,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
else()
if (GGML_CPU_ARM_ARCH)
list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
elseif(GGML_CPU_ALL_VARIANTS)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
# Begin with the lowest baseline
set(ARM_MCPU "armv8-a")
set(ARCH_TAGS "")
set(ARCH_DEFINITIONS "")
# When a feature is selected, bump the MCPU to the first
# version that supported it
if (GGML_INTERNAL_DOTPROD)
set(ARM_MCPU "armv8.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
endif()
if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
set(ARM_MCPU "armv8.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+fp16")
list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
endif()
if (GGML_INTERNAL_SVE)
set(ARM_MCPU "armv8.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+sve")
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
endif()
if (GGML_INTERNAL_MATMUL_INT8)
set(ARM_MCPU "armv8.6-a")
set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
endif()
if (GGML_INTERNAL_SVE2)
set(ARM_MCPU "armv8.6-a")
set(ARCH_TAGS "${ARCH_TAGS}+sve2")
list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
endif()
if (GGML_INTERNAL_SME)
set(ARM_MCPU "armv9.2-a")
set(ARCH_TAGS "${ARCH_TAGS}+sme")
list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
endif()
list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
endif()
endif()
endif()
@@ -306,18 +363,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
message(STATUS "PowerPC detected")

View File

@@ -0,0 +1,94 @@
#include "ggml-backend-impl.h"
#if defined(__aarch64__)
#if defined(__linux__)
#include <sys/auxv.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif
#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM (1 << 13)
#endif
#if !defined(HWCAP2_SME)
#define HWCAP2_SME (1 << 23)
#endif
struct aarch64_features {
// has_neon not needed, aarch64 has NEON guaranteed
bool has_dotprod = false;
bool has_fp16_va = false;
bool has_sve = false;
bool has_sve2 = false;
bool has_i8mm = false;
bool has_sme = false;
aarch64_features() {
#if defined(__linux__)
uint32_t hwcap = getauxval(AT_HWCAP);
uint32_t hwcap2 = getauxval(AT_HWCAP2);
has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
has_fp16_va = !!(hwcap & HWCAP_FPHP);
has_sve = !!(hwcap & HWCAP_SVE);
has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
has_sme = !!(hwcap2 & HWCAP2_SME);
#elif defined(__APPLE__)
int oldp = 0;
size_t size = sizeof(oldp);
if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
has_dotprod = static_cast<bool>(oldp);
}
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
has_i8mm = static_cast<bool>(oldp);
}
if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
has_sme = static_cast<bool>(oldp);
}
// Apple apparently does not implement SVE yet
#endif
}
};
static int ggml_backend_cpu_aarch64_score() {
int score = 1;
aarch64_features af;
#ifdef GGML_USE_DOTPROD
if (!af.has_dotprod) { return 0; }
score += 1<<1;
#endif
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
if (!af.has_fp16_va) { return 0; }
score += 1<<2;
#endif
#ifdef GGML_USE_SVE
if (!af.has_sve) { return 0; }
score += 1<<3;
#endif
#ifdef GGML_USE_MATMUL_INT8
if (!af.has_i8mm) { return 0; }
score += 1<<4;
#endif
#ifdef GGML_USE_SVE2
if (!af.has_sve2) { return 0; }
score += 1<<5;
#endif
#ifdef GGML_USE_SME
if (!af.has_sme) { return 0; }
score += 1<<6;
#endif
return score;
}
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
# endif // defined(__aarch64__)

View File

@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
mul_mv_q4_0_f32_1d_8x_flat
mul_mv_q4_0_f32_1d_16x_flat
mul_mv_q6_k
mul_mv_id_q4_0_f32_8x_flat
mul
norm
relu

View File

@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
cl_program program_upscale;
cl_program program_concat;
cl_program program_tsembd;
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
cl_kernel kernel_add, kernel_add_row;
cl_kernel kernel_mul, kernel_mul_row;
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_concat_f32_contiguous;
cl_kernel kernel_concat_f32_non_contiguous;
cl_kernel kernel_timestep_embedding;
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// Transpose kernels
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// repeat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
}
}
// mul_mv_id_q4_0_f32_8x_flat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
};
#else
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
#endif
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
GGML_LOG_CONT(".");
}
// Adreno kernels
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// transpose
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
}
return false;
case GGML_OP_MUL_MAT_ID:
if (op->src[0]->type == GGML_TYPE_Q4_0) {
if (op->src[1]->type == GGML_TYPE_F32) {
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
}
}
return false;
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
}
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const ggml_tensor * src2 = dst->src[2];
GGML_ASSERT(src2);
GGML_ASSERT(src2->extra);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offset2 = extra2->offset + src2->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
#ifdef GGML_OPENCL_SOA_Q
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
#endif
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb02 = src0->nb[2];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const int ne20 = src2->ne[0];
const int ne21 = src2->ne[1];
const cl_ulong nb21 = src2->nb[1];
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
const int r2 = ne12/ne02;
const int r3 = ne13/ne03;
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
GGML_ASSERT(ne00 == ne10);
int sgs = 32; // subgroup size
int nsg = 1; // number of subgroups
int nrows = 1; // number of row in src1
int ndst = 4; // number of values produced by each subgroup
cl_kernel kernel;
// subgroup mat vec
switch (src0->type) {
case GGML_TYPE_Q4_0: {
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
if (backend_ctx->gpu_family == INTEL) {
sgs = 16;
nsg = 1;
ndst = 8;
} else if (backend_ctx->gpu_family == ADRENO) {
sgs = 64;
nsg = 1;
ndst = 8;
} else {
GGML_ASSERT(false && "TODO: Unknown GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
break;
}
default:
GGML_ASSERT(false && "not implemented");
}
int _ne1 = 1;
int ne123 = dst_rows;
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_mul_mat;
break;
case GGML_OP_MUL_MAT_ID:
if (!any_on_device) {
return false;
}
func = ggml_cl_mul_mat_id;
break;
case GGML_OP_SCALE:
if (!any_on_device) {
return false;

View File

@@ -0,0 +1,283 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
// This function requires the original shuffled weights.
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
// packed together in a byte, so are (q[1], q[17]) and so on.
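// Q4_0 packs q[j] in the low nibble and q[j+16] in the high nibble of byte j,
// so reading two bytes as a ushort exposes q[j], q[j+16], q[j+1] and q[j+17]
// at bit offsets 0, 4, 8 and 12; that is what the 0x000F/0x00F0/0x0F00/0xF000
// masks pick out. The matching yl lanes are pre-scaled by 1/16, 1/256 and
// 1/4096 so the masked values need no shifting, and the final "sumy * -8.f"
// term applies the Q4_0 zero-point of 8 in a single step.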
inline float block_q_4_0_dot_y_flat(
global uchar * x,
global half * dh,
float sumy,
float16 yl,
int il
) {
float d = *dh;
global ushort * qs = ((global ushort *)x + il/2);
float acc = 0.f;
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
//
// This variant outputs 8 values.
//
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 8 // each SIMD group works on 8 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
inline void mul_vec_q_n_f32_8x_flat(
global char * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = 0;
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl;
float8 sumf = 0.f;
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0.f;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
yb += QK4_0 * (N_SIMDWIDTH/2);
}
float8 tot = (float8)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
global char * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global char * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb02,
int ne10,
int ne11,
int ne12,
ulong nb11,
ulong nb12,
int ne20,
int ne21,
ulong nb21,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float *)((global char *)src1 + offset1);
src2 = (global char *)((global char *)src2 + offset2);
dst = (global float *)((global char *)dst + offsetd);
const int iid1 = get_group_id(2)/ne20;
const int idx = get_group_id(2)%ne20;
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
const int i11 = idx%ne11;
const int i12 = iid1;
const int i1 = idx;
const int i2 = i12;
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

View File

@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
#define VK_VENDOR_ID_INTEL 0x8086
#define VK_VENDOR_ID_NVIDIA 0x10de
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
#define GGML_VK_MAX_NODES 8192
@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
struct ggml_backend_vk_context;
struct vk_queue {
uint32_t queue_family_index;
vk::Queue queue;
vk::CommandPool pool;
uint32_t cmd_buffer_idx;
std::vector<vk::CommandBuffer> cmd_buffers;
vk::PipelineStageFlags stage_flags;
bool transfer_only;
};
#define MAX_PARAMETER_COUNT 8
struct vk_pipeline_struct {
std::string name;
vk::ShaderModule shader_module;
vk::DescriptorSetLayout dsl;
std::vector<vk::DescriptorPool> descriptor_pools;
std::vector<vk::DescriptorSet> descriptor_sets;
uint32_t descriptor_set_idx;
vk::PipelineLayout layout;
vk::Pipeline pipeline;
uint32_t push_constant_size;
@@ -167,6 +153,40 @@ struct ggml_backend_vk_buffer_type_context {
vk_device device;
};
struct vk_queue;
// Stores command pool/buffers. There's an instance of this
// for each (context,queue) pair and for each (device,queue) pair.
struct vk_command_pool {
void init(vk_device& device, vk_queue *q_);
void destroy(vk::Device& device);
vk::CommandPool pool;
uint32_t cmd_buffer_idx;
std::vector<vk::CommandBuffer> cmd_buffers;
vk_queue *q;
};
struct vk_queue {
uint32_t queue_family_index;
vk::Queue queue;
vk_command_pool cmd_pool;
vk::PipelineStageFlags stage_flags;
bool transfer_only;
// copy everything except the cmd_pool
void copyFrom(vk_queue &other) {
queue_family_index = other.queue_family_index;
queue = other.queue;
stage_flags = other.stage_flags;
transfer_only = other.transfer_only;
}
};
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
@@ -341,6 +361,8 @@ struct vk_device_struct {
// set to true to indicate that some shaders need to be compiled after the dryrun
bool need_compiles {};
vk::DescriptorSetLayout dsl;
vk_matmul_pipeline pipeline_matmul_f32 {};
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
vk_matmul_pipeline pipeline_matmul_bf16 {};
@@ -458,7 +480,6 @@ struct vk_device_struct {
vk_pipeline pipeline_flash_attn_split_k_reduce;
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
@@ -483,10 +504,8 @@ struct vk_device_struct {
ggml_vk_destroy_buffer(sync_staging);
device.destroyCommandPool(compute_queue.pool);
if (!single_queue) {
device.destroyCommandPool(transfer_queue.pool);
}
compute_queue.cmd_pool.destroy(device);
transfer_queue.cmd_pool.destroy(device);
for (auto& pipeline : pipelines) {
if (pipeline.second.expired()) {
@@ -498,10 +517,26 @@ struct vk_device_struct {
}
pipelines.clear();
device.destroyDescriptorSetLayout(dsl);
device.destroy();
}
};
void vk_command_pool::init(vk_device& device, vk_queue *q_) {
cmd_buffer_idx = 0;
q = q_;
vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
pool = device->device.createCommandPool(command_pool_create_info);
}
void vk_command_pool::destroy(vk::Device& device) {
device.destroyCommandPool(pool);
pool = nullptr;
cmd_buffers.clear();
}
struct vk_buffer_struct {
vk::Buffer buffer = VK_NULL_HANDLE;
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
@@ -819,7 +854,7 @@ struct vk_context_struct {
std::vector<vk_staging_memcpy> in_memcpys;
std::vector<vk_staging_memcpy> out_memcpys;
vk_queue * q;
vk_command_pool * p {};
};
typedef std::shared_ptr<vk_context_struct> vk_context;
typedef std::weak_ptr<vk_context_struct> vk_context_ref;
@@ -930,6 +965,14 @@ struct ggml_backend_vk_context {
vk_context_ref transfer_ctx;
std::vector<vk_context_ref> tensor_ctxs;
std::vector<vk::DescriptorPool> descriptor_pools;
std::vector<vk::DescriptorSet> descriptor_sets;
uint32_t descriptor_set_idx {};
uint32_t pipeline_descriptor_set_requirements {};
vk_command_pool compute_cmd_pool;
vk_command_pool transfer_cmd_pool;
};
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -1060,39 +1103,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
GGML_ASSERT(parameter_count > 0);
GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
for (uint32_t i = 0; i < parameter_count; i++) {
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
dsl_binding_flags.push_back({});
}
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
vk::PushConstantRange pcr(
vk::ShaderStageFlagBits::eCompute,
0,
pipeline->push_constant_size
);
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
{},
dsl_binding);
descriptor_set_layout_create_info.setPNext(&dslbfci);
pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
pipeline->descriptor_set_idx = 0;
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
@@ -1167,15 +1190,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
for (auto& pool : pipeline->descriptor_pools) {
device.destroyDescriptorPool(pool);
}
pipeline->descriptor_pools.clear();
pipeline->descriptor_sets.clear();
pipeline->descriptor_set_idx = 0;
device.destroyDescriptorSetLayout(pipeline->dsl);
device.destroyPipelineLayout(pipeline->layout);
device.destroyShaderModule(pipeline->shader_module);
@@ -1183,97 +1197,76 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
device.destroyPipeline(pipeline->pipeline);
}
static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
device->pipeline_descriptor_set_requirements[pipeline->name] += n;
ctx->pipeline_descriptor_set_requirements += n;
if (!pipeline->compiled) {
pipeline->needed = true;
device->need_compiles = true;
ctx->device->need_compiles = true;
}
}
static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
std::lock_guard<std::mutex> guard(device->mutex);
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
for (auto& pair : device->pipeline_descriptor_set_requirements) {
vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
const uint64_t n = pair.second;
if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
// Enough descriptors are available
return;
}
VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
vk_device& device = ctx->device;
if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
// Enough descriptors are available
continue;
uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
while (to_alloc > 0) {
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
to_alloc -= alloc_count;
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
if (pool_idx >= ctx->descriptor_pools.size()) {
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
}
uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
while (to_alloc > 0) {
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
to_alloc -= alloc_count;
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
if (pool_idx >= pipeline->descriptor_pools.size()) {
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
}
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
for (uint32_t i = 0; i < alloc_count; i++) {
layouts[i] = pipeline->dsl;
}
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
pool_idx++;
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
for (uint32_t i = 0; i < alloc_count; i++) {
layouts[i] = device->dsl;
}
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
pool_idx++;
}
}
static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
pipeline->descriptor_set_idx = 0;
}
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
std::lock_guard<std::mutex> guard(device->mutex);
if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
// Reuse command buffer
return q.cmd_buffers[q.cmd_buffer_idx++];
return p.cmd_buffers[p.cmd_buffer_idx++];
}
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
q.pool,
p.pool,
vk::CommandBufferLevel::ePrimary,
1);
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
auto buf = cmd_buffers.front();
q.cmd_buffers.push_back(buf);
q.cmd_buffer_idx++;
p.cmd_buffers.push_back(buf);
p.cmd_buffer_idx++;
return buf;
}
static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
VK_LOG_DEBUG("ggml_vk_create_submission()");
vk_submission s;
s.buffer = ggml_vk_create_cmd_buffer(device, q);
s.wait_semaphores = std::move(wait_semaphores);
s.signal_semaphores = std::move(signal_semaphores);
return s;
}
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
if (ctx->seqs.empty()) {
if (fence) {
ctx->q->queue.submit({}, fence);
ctx->p->q->queue.submit({}, fence);
}
return;
}
@@ -1312,7 +1305,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
tl_signal_vals.push_back({});
tl_signal_semaphores.push_back({});
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
stage_flags[idx].push_back(ctx->q->stage_flags);
stage_flags[idx].push_back(ctx->p->q->stage_flags);
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
}
@@ -1342,7 +1335,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
}
}
ctx->q->queue.submit(submit_infos, fence);
ctx->p->q->queue.submit(submit_infos, fence);
ctx->seqs.clear();
}
@@ -1400,28 +1393,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
q.queue_family_index = queue_family_index;
q.transfer_only = transfer_only;
vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
q.pool = device->device.createCommandPool(command_pool_create_info_compute);
q.cmd_buffer_idx = 0;
q.cmd_pool.init(device, &q);
q.queue = device->device.getQueue(queue_family_index, queue_index);
q.stage_flags = stage_flags;
}
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
vk_context result = std::make_shared<vk_context_struct>();
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
ctx->gc.contexts.emplace_back(result);
result->q = &q;
result->p = &p;
return result;
}
static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
vk_context result = std::make_shared<vk_context_struct>();
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
result->q = &q;
result->p = &p;
return result;
}
@@ -1454,15 +1444,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
return ctx->gc.events[ctx->event_idx++];
}
static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
std::lock_guard<std::mutex> guard(device->mutex);
static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
// Requires command buffers to be done
device->device.resetCommandPool(q.pool);
q.cmd_buffer_idx = 0;
device->device.resetCommandPool(p.pool);
p.cmd_buffer_idx = 0;
}
static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
// Arbitrary frequency to cleanup/reuse command buffers
static constexpr uint32_t cleanup_frequency = 10;
if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
}
if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
}
}
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
vk::MemoryType memory_type = mem_props->memoryTypes[i];
@@ -1481,8 +1485,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
}
std::lock_guard<std::mutex> guard(device->mutex);
vk_buffer buf = std::make_shared<vk_buffer_struct>();
if (size == 0) {
@@ -1611,11 +1613,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
static void ggml_vk_sync_buffers(vk_context& ctx) {
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
const bool transfer_queue = ctx->q->transfer_only;
const bool transfer_queue = ctx->p->q->transfer_only;
ctx->s->buffer.pipelineBarrier(
ctx->q->stage_flags,
ctx->q->stage_flags,
ctx->p->q->stage_flags,
ctx->p->q->stage_flags,
{},
{ {
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
@@ -1634,8 +1636,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
ctx->s->buffer.waitEvents(
events,
ctx->q->stage_flags,
ctx->q->stage_flags,
ctx->p->q->stage_flags,
ctx->p->q->stage_flags,
{},
{},
{}
@@ -3369,6 +3371,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
}
}
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
dsl_binding_flags.push_back({});
}
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
{},
dsl_binding);
descriptor_set_layout_create_info.setPNext(&dslbfci);
device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
ggml_vk_load_shaders(device);
if (!device->single_queue) {
@@ -3376,7 +3394,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
} else {
// TODO: Use pointer or reference to avoid copy
device->transfer_queue = device->compute_queue;
device->transfer_queue.copyFrom(device->compute_queue);
device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
}
device->buffer_type = {
@@ -3742,6 +3761,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
ctx->fence = ctx->device->device.createFence({});
ctx->almost_ready_fence = ctx->device->device.createFence({});
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
#ifdef GGML_VULKAN_CHECK_RESULTS
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
@@ -4107,9 +4129,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
}
}
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
vk_submission s;
s.buffer = ggml_vk_create_cmd_buffer(device, q);
s.buffer = ggml_vk_create_cmd_buffer(device, p);
if (one_time) {
s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
} else {
@@ -4154,10 +4176,10 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
}
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
@@ -4194,7 +4216,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
ggml_vk_ctx_end(subctx);
}
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
}
@@ -4395,7 +4417,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
}
} else {
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
std::lock_guard<std::mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
ggml_vk_ctx_end(subctx);
@@ -4407,6 +4431,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
dst->device->device.resetFences({ dst->device->fence });
ggml_vk_queue_command_pools_cleanup(dst->device);
}
}
@@ -4483,7 +4508,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
memcpy(dst, (uint8_t *) src->ptr + offset, size);
} else {
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
std::lock_guard<std::mutex> guard(src->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
ggml_vk_ctx_end(subctx);
@@ -4491,6 +4518,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);
for (auto& cpy : subctx->out_memcpys) {
memcpy(cpy.dst, cpy.src, cpy.n);
@@ -4510,15 +4538,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
if (src->device == dst->device) {
std::lock_guard<std::mutex> guard(src->device->mutex);
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
// Copy within the device
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(src->device, subctx);
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, src->device->fence);
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
src->device->device.resetFences({ src->device->fence });
ggml_vk_queue_command_pools_cleanup(src->device);
} else {
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
// Copy device to device
@@ -4543,7 +4573,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
std::lock_guard<std::mutex> guard(dst->device->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dst->device, subctx);
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
ggml_vk_ctx_end(subctx);
@@ -4551,6 +4582,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_submit(subctx, dst->device->fence);
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
dst->device->device.resetFences({ dst->device->fence });
ggml_vk_queue_command_pools_cleanup(dst->device);
}
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
@@ -4964,18 +4996,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
}
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (qx_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
if (quantize_y) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
}
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
}
return;
}
@@ -5157,12 +5189,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
// Request descriptor sets
if (qx_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
return;
}
@@ -5295,7 +5327,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
if (dryrun) {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
return;
}
@@ -5384,7 +5416,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
if (dryrun) {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
return;
}
@@ -5571,12 +5603,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
}
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (qx_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
return;
}
@@ -5765,12 +5797,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
// Request descriptor sets
if (qx_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
}
if (qy_needs_dequant) {
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
}
ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
return;
}
@@ -6090,9 +6122,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
if (dryrun) {
// Request descriptor sets
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
}
return;
}
@@ -6655,7 +6687,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
}
if (dryrun) {
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return;
}
@@ -7036,7 +7068,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
GGML_ASSERT(pipeline != nullptr);
if (dryrun) {
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return;
}
@@ -7175,7 +7207,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
GGML_ASSERT(pipeline != nullptr);
if (dryrun) {
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return;
}
@@ -7853,9 +7885,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
}
}
ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer
@@ -7870,7 +7902,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
ggml_vk_load_shaders(ctx->device);
}
ggml_pipeline_allocate_descriptor_sets(ctx->device);
ggml_pipeline_allocate_descriptor_sets(ctx);
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -7912,7 +7944,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ggml_vk_ctx_begin(ctx->device, subctx);
for (size_t i = 0; i < num_it; i++) {
ggml_vk_matmul(
@@ -7928,6 +7960,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
ctx->device->device.resetFences({ ctx->fence });
ggml_vk_queue_command_pools_cleanup(ctx->device);
auto end = std::chrono::high_resolution_clock::now();
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
@@ -8029,16 +8062,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
free(d_chk);
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
ggml_vk_destroy_buffer(d_X);
ggml_vk_destroy_buffer(d_Y);
ggml_vk_destroy_buffer(d_D);
ggml_pipeline_cleanup(p);
ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
free(x);
free(y);
free(d);
@@ -8116,17 +8146,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
ggml_vk_quantize_data(x, qx, ne, quant);
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
ggml_pipeline_request_descriptor_sets(ctx, p, 1);
if (ctx->device->need_compiles) {
ggml_vk_load_shaders(ctx->device);
}
ggml_pipeline_allocate_descriptor_sets(ctx->device);
ggml_pipeline_allocate_descriptor_sets(ctx);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ggml_vk_ctx_begin(ctx->device, subctx);
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
@@ -8137,6 +8167,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
ctx->device->device.resetFences({ ctx->fence });
ggml_vk_queue_command_pools_cleanup(ctx->device);
auto end = std::chrono::high_resolution_clock::now();
@@ -8216,17 +8247,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
//
// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
//
// ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
// ggml_pipeline_request_descriptor_sets(ctx, p, 1);
//
// if (ctx->device->need_compiles) {
// ggml_vk_load_shaders(ctx->device);
// }
//
// ggml_pipeline_allocate_descriptor_sets(ctx->device);
// ggml_pipeline_allocate_descriptor_sets(ctx);
//
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
//
// vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
// ggml_vk_ctx_begin(ctx->device, subctx);
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
// ggml_vk_ctx_end(subctx);
@@ -8236,6 +8267,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
// ggml_vk_submit(subctx, ctx->fence);
// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
// ctx->device->device.resetFences({ ctx->fence });
// ggml_vk_queue_command_pools_cleanup(ctx->device);
//
// auto end = std::chrono::high_resolution_clock::now();
//
@@ -8375,9 +8407,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
// y[i] = i % k;
}
ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
if (split_k > 1) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
// Resize buffer
@@ -8388,19 +8420,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
}
}
if (mmq) {
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
}
if (ctx->device->need_compiles) {
ggml_vk_load_shaders(ctx->device);
}
ggml_pipeline_allocate_descriptor_sets(ctx->device);
ggml_pipeline_allocate_descriptor_sets(ctx);
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ggml_vk_ctx_begin(ctx->device, subctx);
if (mmq) {
for (size_t i = 0; i < num_it; i++) {
@@ -8429,6 +8461,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
ggml_vk_submit(subctx, ctx->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
ctx->device->device.resetFences({ ctx->fence });
ggml_vk_queue_command_pools_cleanup(ctx->device);
auto end = std::chrono::high_resolution_clock::now();
@@ -8743,7 +8776,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
if (!dryrun) {
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
@@ -8797,7 +8830,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
// These operations all go through ggml_vk_op_f32, so short-circuit and
// do the only thing needed for the dryrun.
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
return false;
}
default:
@@ -9189,19 +9222,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
}
ctx->gc.temp_buffers.clear();
for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
if (plr.expired()) {
continue;
}
vk_pipeline pl = plr.lock();
ggml_pipeline_cleanup(pl);
}
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
@@ -9222,7 +9244,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
ctx->tensor_ctxs.clear();
ctx->gc.contexts.clear();
ctx->device->pipeline_descriptor_set_requirements.clear();
ctx->pipeline_descriptor_set_requirements = 0;
ctx->descriptor_set_idx = 0;
}
// Clean up on backend free
@@ -9249,6 +9272,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
ctx->device->device.destroyFence(ctx->fence);
ctx->device->device.destroyFence(ctx->almost_ready_fence);
for (auto& pool : ctx->descriptor_pools) {
ctx->device->device.destroyDescriptorPool(pool);
}
ctx->descriptor_pools.clear();
ctx->descriptor_sets.clear();
ctx->compute_cmd_pool.destroy(ctx->device->device);
ctx->transfer_cmd_pool.destroy(ctx->device->device);
}
static int ggml_vk_get_device_count() {
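The counters and containers reset and destroyed above summarize the new ownership model: each backend context keeps its own descriptor pools and sets plus one request counter and one usage counter, so the dryrun pass only bumps the request count and a single allocation call sizes things before recording. A rough sketch of that two-phase accounting follows; the member and function names mirror the ones touched in this diff, while the types and pool sizing are plain stand-ins rather than the real vulkan.hpp code.

    #include <cstdint>
    #include <vector>

    // Stand-ins for vk::DescriptorPool / vk::DescriptorSet handles.
    using pool_handle = int;
    using set_handle  = int;

    struct vk_context_sketch {
        std::vector<pool_handle> descriptor_pools;
        std::vector<set_handle>  descriptor_sets;
        uint32_t pipeline_descriptor_set_requirements = 0; // summed while dryrun-ing the graph
        uint32_t descriptor_set_idx = 0;                    // next set handed out while recording
    };

    // dryrun phase: every op only records how many sets it will need
    static void request_descriptor_sets(vk_context_sketch & ctx, uint32_t n) {
        ctx.pipeline_descriptor_set_requirements += n;
    }

    // before recording: grow the per-context pools/sets to cover all requests
    static void allocate_descriptor_sets(vk_context_sketch & ctx) {
        if (ctx.descriptor_sets.size() < ctx.pipeline_descriptor_set_requirements) {
            // the real code creates another descriptor pool and allocates the
            // missing sets from it; the sketch just grows the vectors
            ctx.descriptor_pools.push_back(0);
            ctx.descriptor_sets.resize(ctx.pipeline_descriptor_set_requirements, 0);
        }
    }

    int main() {
        vk_context_sketch ctx;
        request_descriptor_sets(ctx, 3); // dryrun requests from a few ops
        request_descriptor_sets(ctx, 1);
        allocate_descriptor_sets(ctx);
        return ctx.descriptor_sets.size() == 4 ? 0 : 1;
    }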
@@ -9515,7 +9547,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
@@ -9538,7 +9570,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
@@ -9561,7 +9593,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
if (ctx->transfer_ctx.expired()) {
// Initialize new transfer context
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
ctx->transfer_ctx = transfer_ctx;
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
} else {
@@ -9622,7 +9654,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ggml_vk_load_shaders(ctx->device);
}
ggml_vk_preallocate_buffers(ctx);
ggml_pipeline_allocate_descriptor_sets(ctx->device);
ggml_pipeline_allocate_descriptor_sets(ctx);
int last_node = cgraph->n_nodes - 1;
@@ -9654,7 +9686,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
@@ -9689,7 +9721,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
if (vk_perf_logger_enabled) {
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {


@@ -1332,7 +1332,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
if (n_tokens % n_seqs != 0) {
n_tokens = (n_tokens / n_seqs) * n_seqs;
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
n_outputs = std::min(n_outputs, n_tokens);
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);


@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
}
void llama_kv_cache_unified::clear(bool data) {
@@ -517,14 +520,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
return -1;
}
//#define FIND_SLOT_DEBUG 1
#if FIND_SLOT_DEBUG
LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
if (debug > 0) {
LLAMA_LOG_CONT("\n");
LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
// for debugging
{
std::string ss;
if (n_swa > 0) {
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
for (uint32_t i = 0; i < cells.size(); ++i) {
if (cells.is_empty(i)) {
ss += '.';
@@ -532,21 +533,45 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
ss += std::to_string(cells.seq_get(i));
}
if (i%256 == 255) {
ss += " *";
ss += '\n';
}
}
}
LLAMA_LOG_WARN("\n%s\n", ss.c_str());
}
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
if (cells.seq_pos_min(s) < 0) {
continue;
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
}
LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
for (uint32_t i = 0; i < cells.size(); ++i) {
std::string cur;
if (cells.is_empty(i)) {
cur = '.';
} else {
cur = std::to_string(cells.pos_get(i));
}
const int n = cur.size();
for (int j = 0; j < 5 - n; ++j) {
cur += ' ';
}
ss += cur;
if (i%256 == 255) {
ss += " *";
}
if (i%64 == 63) {
ss += '\n';
}
}
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
}
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
if (cells.seq_pos_min(s) < 0) {
continue;
}
LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
}
}
#endif
uint32_t n_tested = 0;
@@ -557,21 +582,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
continue;
}
// keep track of what the minimum sequence positions would be if we accept the ubatch
llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
seq_pos_min[s] = cells.seq_pos_min(s);
}
bool found = true;
for (uint32_t i = 0; i < n_tokens; i++) {
const llama_pos pos = ubatch.pos[i];
const llama_seq_id seq_id = ubatch.seq_id[i][0];
//const llama_pos pos = ubatch.pos[i];
//const llama_seq_id seq_id = ubatch.seq_id[i][0];
// can we use this cell? either:
// - the cell is empty
// - the cell is occupied only by one sequence:
// - mask causally, if the sequence is the same as the one we are inserting
// - (disabled) mask causally, if the sequence is the same as the one we are inserting
// - mask SWA, using current max pos for that sequence in the cache
// always insert in the cell with minimum pos
bool can_use = cells.is_empty(head_cur + i);
@@ -579,21 +598,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
if (!can_use && cells.seq_count(head_cur + i) == 1) {
const llama_pos pos_cell = cells.pos_get(head_cur + i);
// causal mask
if (cells.seq_has(head_cur + i, seq_id)) {
can_use = pos_cell >= pos;
}
// (disabled) causal mask
// note: it's better to purge any "future" tokens beforehand
//if (cells.seq_has(head_cur + i, seq_id)) {
// can_use = pos_cell >= pos;
//}
if (!can_use) {
const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
// SWA mask
// note: we insert only in the cell with minimum pos in order to preserve the invariant that
// all positions between [pos_min, pos_max] for each sequence will be present in the cache
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
if (pos_cell == seq_pos_min[seq_id_cell] &&
is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
seq_pos_min[seq_id_cell]++;
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
can_use = true;
}
}
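In other words, the relaxed rule lets a cell be recycled as soon as its stored position already lies outside the sliding window of its own sequence, without also requiring it to be that sequence's minimum position; the invariant is instead restored in apply_ubatch further down by purging the range that falls behind the overwritten cells. A tiny illustration with an assumed standard-window predicate (the real is_masked_swa also handles other window types, so treat this as a sketch only):

    #include <cstdio>

    // Assumed standard sliding-window check: p0 is masked once it trails p1 by n_swa or more.
    static bool masked_swa_standard(int p0, int p1, int n_swa) {
        return p1 - p0 >= n_swa;
    }

    int main() {
        const int n_swa       = 4;
        const int pos_cell    = 10; // position stored in the candidate cell
        const int seq_pos_max = 20; // current max position of that cell's sequence

        // relaxed rule: reusable whenever the cell already lies outside the window
        const bool can_use = masked_swa_standard(pos_cell, seq_pos_max + 1, n_swa);
        std::printf("can_use = %d\n", can_use); // 1 with these numbers
        return 0;
    }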
@@ -621,8 +636,22 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
}
void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
// keep track of the max sequence position that we would overwrite with this ubatch
// for non-SWA cache, this would be always empty
llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
seq_pos_max_rm[s] = -1;
}
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
if (!cells.is_empty(head_cur + i)) {
assert(cells.seq_count(head_cur + i) == 1);
const llama_seq_id seq_id = cells.seq_get(head_cur + i);
const llama_pos pos = cells.pos_get(head_cur + i);
seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
cells.rm(head_cur + i);
}
@@ -633,6 +662,22 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
}
}
// note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
// will be present in the cache. so we have to purge any position which is less than those we would overwrite
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
if (seq_pos_max_rm[s] == -1) {
continue;
}
if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
__func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);
seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
}
}
// move the head at the end of the slot
head = head_cur + ubatch.n_tokens;
}
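To make the purge above concrete with hypothetical numbers: suppose sequence 0 currently holds positions 10..20 in the cache and the incoming ubatch overwrites cells that held positions 12..14 of that sequence. Then seq_pos_max_rm[0] becomes 14, and since the sequence minimum 10 is <= 14, positions 10..14 are removed so the remaining range stays gap-free. A small self-contained sketch of that bookkeeping, using a plain position set as a stand-in for the real cells structure:

    #include <algorithm>
    #include <cstdio>
    #include <set>

    int main() {
        std::set<int> seq0_pos; // stand-in for the positions cached for sequence 0
        for (int p = 10; p <= 20; ++p) seq0_pos.insert(p);

        int seq_pos_max_rm = -1;         // max position overwritten by the incoming ubatch
        for (int p = 12; p <= 14; ++p) { // hypothetical overwritten cells
            seq_pos_max_rm = std::max(seq_pos_max_rm, p);
            seq0_pos.erase(p);
        }

        const int seq_pos_min = *seq0_pos.begin(); // 10
        if (seq_pos_min <= seq_pos_max_rm) {
            // purge [seq_pos_min, seq_pos_max_rm] so [pos_min, pos_max] stays contiguous
            for (int p = seq_pos_min; p <= seq_pos_max_rm; ++p) seq0_pos.erase(p);
        }

        std::printf("remaining min pos: %d\n", *seq0_pos.begin()); // 15
        return 0;
    }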


@@ -158,6 +158,8 @@ private:
// SWA
const uint32_t n_swa = 0;
int debug = 0;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs;


@@ -42,6 +42,34 @@ function(llama_test target)
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()
function(llama_test_cmd target)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()
    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()
    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_NAME ${LLAMA_TEST_NAME})
    else()
        set(TEST_NAME ${target})
    endif()

    add_test(
        NAME ${TEST_NAME}
        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
        COMMAND ${target}
        ${LLAMA_TEST_ARGS})

    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()
# Builds and runs a test source file.
# Optional args:
# - NAME: name of the executable & test target (defaults to the source file name without extension)
@@ -83,25 +111,31 @@ endfunction()
# build test-tokenizer-0 target once and add many tests
llama_build(test-tokenizer-0.cpp)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-bert-bge.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-command-r.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-phi-3.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-qwen2.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
if (NOT WIN32)
llama_test_cmd(
${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh
NAME test-tokenizers-ggml-vocabs
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
ARGS https://huggingface.co/ggml-org/vocabs ${PROJECT_SOURCE_DIR}/models/ggml-vocabs
)
endif()
if (LLAMA_LLGUIDANCE)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
endif ()
if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
@@ -113,8 +147,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
llama_build_and_test(test-chat.cpp)
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
endif()
if (NOT GGML_BACKEND_DL)
@@ -127,20 +161,20 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
llama_build(test-tokenizer-1-bpe.cpp)
# TODO: disabled due to slowness
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-aquila.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-neox.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
# build test-tokenizer-1-spm target once and add many tests
llama_build(test-tokenizer-1-spm.cpp)
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
# llama_build_and_test(test-double-float.cpp) # SLOW
endif()

tests/test-tokenizers-repo.sh (new executable file, 36 lines added)

@@ -0,0 +1,36 @@
#!/bin/bash

if [ $# -lt 2 ]; then
    printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
    exit 1
fi

if [ $# -eq 3 ]; then
    toktest=$3
else
    toktest="./test-tokenizer-0"
fi

if [ ! -x $toktest ]; then
    printf "Test executable \"$toktest\" not found!\n"
    exit 1
fi

repo=$1
folder=$2

if [ -d $folder ] && [ -d $folder/.git ]; then
    (cd $folder; git pull)
else
    git clone $repo $folder
fi

shopt -s globstar
for gguf in $folder/**/*.gguf; do
    if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
        $toktest $gguf
    else
        printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
    fi
done

Binary file not shown.


@@ -233,6 +233,7 @@ struct server_task {
slot_params defaults;
defaults.sampling = params_base.sampling;
defaults.speculative = params_base.speculative;
defaults.n_keep = params_base.n_keep;
// enabling this will output extra debug information in the HTTP responses from the server
params.verbose = params_base.verbosity > 9;
@@ -2060,6 +2061,7 @@ struct server_context {
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
slot.params.sampling = params_base.sampling;
slot.params.n_keep = params_base.n_keep;
slot.callback_on_release = [this](int) {
queue_tasks.pop_deferred_task();


@@ -41,6 +41,10 @@ html {
max-width: 900px;
}
.chat-bubble {
@apply break-words;
}
.chat-bubble-base-300 {
--tw-bg-opacity: 1;
--tw-text-opacity: 1;