Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-05 13:53:23 +02:00)
Compare commits
12 Commits
| Author | SHA1 | Date |
|---|---|---|
| | a20b2b05bc | |
| | 2e89f76b7a | |
| | 532802f938 | |
| | d4e0d95cf5 | |
| | cc66a7f78f | |
| | bd248d4dc7 | |
| | 7781e5fe99 | |
| | 89a184fa71 | |
| | 2baf07727f | |
| | 7ae2932116 | |
| | 1f7d50b293 | |
| | 4c763c8d1b | |
@@ -7,8 +7,8 @@ llama_add_compile_flags()
# Build info header
#

-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")

    # Is git submodule
    if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,7 +18,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
        if (SLASH_POS EQUAL 0)
            set(GIT_DIR "${REAL_GIT_DIR}")
        else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+            set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
        endif()
    endif()

@@ -42,7 +42,7 @@ add_custom_command(
        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
        -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
        -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
)
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_

std::string regex_escape(const std::string & s) {
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
}

std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
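The replacement-string fix above is subtle: in std::regex_replace's default ECMAScript format, `$&` expands to the entire match, while `$0` is not one of the defined substitutions, so the old pattern did not reliably prepend a backslash to each special character. A minimal sketch of the intended behaviour (the regex is the one from the hunk; the main() harness is only for illustration):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // same character class as regex_escape() in the hunk above
    static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
    const std::string s = "a+b(c)?";
    // "\\$&" emits a backslash followed by the whole match, e.g. "+" -> "\+"
    std::cout << std::regex_replace(s, special_chars, "\\$&") << "\n"; // prints a\+b\(c\)\?
}
```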
@@ -270,17 +270,23 @@ endfunction()
function(ggml_add_cpu_backend_variant tag_name)
    set(GGML_CPU_TAG_NAME ${tag_name})
    # other: OPENMP LLAMAFILE CPU_HBM
-    foreach (feat NATIVE
-             SSE42
-             AVX AVX2 BMI2 AVX_VNNI FMA F16C
-             AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
-             AMX_TILE AMX_INT8 AMX_BF16)
-        set(GGML_${feat} OFF)
-    endforeach()
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        foreach (feat NATIVE
+                 SSE42
+                 AVX AVX2 BMI2 AVX_VNNI FMA F16C
+                 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                 AMX_TILE AMX_INT8 AMX_BF16)
+            set(GGML_${feat} OFF)
+        endforeach()

-    foreach (feat ${ARGN})
-        set(GGML_${feat} ON)
-    endforeach()
+        foreach (feat ${ARGN})
+            set(GGML_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    endif()

    ggml_add_cpu_backend_variant_impl(${tag_name})
endfunction()
@@ -290,6 +296,8 @@ ggml_add_backend(CPU)
if (GGML_CPU_ALL_VARIANTS)
    if (NOT GGML_BACKEND_DL)
        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    elseif (GGML_CPU_ARM_ARCH)
+        message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
    endif()
    if (GGML_SYSTEM_ARCH STREQUAL "x86")
        ggml_add_cpu_backend_variant(x64)
@@ -303,8 +311,20 @@ if (GGML_CPU_ALL_VARIANTS)
            # MSVC doesn't support AMX
            ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
        endif()
+    elseif(GGML_SYSTEM_ARCH STREQUAL "ARM" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
+        # Many of these features are optional so we build versions with popular
+        # combinations and name the backends based on the version they were
+        # first released with
+        ggml_add_cpu_backend_variant(armv8.0_1)
+        ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
+        ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+        ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+        ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+        ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+        ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+        ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
    else()
-        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
    endif()
elseif (GGML_CPU)
    ggml_add_cpu_backend_variant_impl("")
@@ -1,3 +1,17 @@
+function(ggml_add_cpu_backend_features cpu_name arch)
+    # The feature detection code is compiled as a separate target so that
+    # it can be built without the architecture flags
+    # Since multiple variants of the CPU backend may be included in the same
+    # build, using set_source_files_properties() to set the arch flags is not possible
+    set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
+    add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
+    target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
+    set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
+endfunction()
+
function(ggml_add_cpu_backend_variant_impl tag_name)
    if (tag_name)
        set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -143,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        else()
            if (GGML_CPU_ARM_ARCH)
                list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
+            elseif(GGML_CPU_ALL_VARIANTS)
+                if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+                    # Begin with the lowest baseline
+                    set(ARM_MCPU "armv8-a")
+                    set(ARCH_TAGS "")
+                    set(ARCH_DEFINITIONS "")
+
+                    # When a feature is selected, bump the MCPU to the first
+                    # version that supported it
+                    if (GGML_INTERNAL_DOTPROD)
+                        set(ARM_MCPU "armv8.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
+                    endif()
+                    if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
+                        set(ARM_MCPU "armv8.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+fp16")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
+                    endif()
+                    if (GGML_INTERNAL_SVE)
+                        set(ARM_MCPU "armv8.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+sve")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
+                    endif()
+                    if (GGML_INTERNAL_MATMUL_INT8)
+                        set(ARM_MCPU "armv8.6-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
+                    endif()
+                    if (GGML_INTERNAL_SVE2)
+                        set(ARM_MCPU "armv8.6-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+sve2")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
+                    endif()
+                    if (GGML_INTERNAL_SME)
+                        set(ARM_MCPU "armv9.2-a")
+                        set(ARCH_TAGS "${ARCH_TAGS}+sme")
+                        list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
+                    endif()
+
+                    list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
+                    ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
+                endif()
            endif()
        endif()

@@ -306,18 +363,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
            message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
        endif()
-
-        # The feature detection code is compiled as a separate target so that
-        # it can be built without the architecture flags
-        # Since multiple variants of the CPU backend may be included in the same
-        # build, using set_source_files_properties() to set the arch flags is not possible
-        set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
-        add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/x86/cpu-feats.cpp)
-        target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
-        target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
-        set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
    endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
    message(STATUS "PowerPC detected")
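To make the mapping above concrete: each selected feature appends its tag to the -march string and raises the baseline to the first architecture revision that introduced it, so the armv8.6_2 variant ends up built with `-march=armv8.6-a+dotprod+fp16+sve+i8mm+sve2` plus one GGML_USE_* define per feature. Below is a hedged C++ mirror of that CMake logic, purely for illustration (not part of ggml):

```cpp
#include <cstdio>
#include <string>

int main() {
    // feature set of the armv8.6_2 variant from the CMake list above
    const bool dotprod = true, fp16 = true, sve = true, i8mm = true, sve2 = true, sme = false;

    std::string mcpu = "armv8-a";   // lowest baseline
    std::string tags;

    if (dotprod) { mcpu = "armv8.2-a"; tags += "+dotprod"; }
    if (fp16)    { mcpu = "armv8.2-a"; tags += "+fp16";    }
    if (sve)     { mcpu = "armv8.2-a"; tags += "+sve";     }
    if (i8mm)    { mcpu = "armv8.6-a"; tags += "+i8mm";    }
    if (sve2)    { mcpu = "armv8.6-a"; tags += "+sve2";    }
    if (sme)     { mcpu = "armv9.2-a"; tags += "+sme";     }

    // prints: -march=armv8.6-a+dotprod+fp16+sve+i8mm+sve2
    std::printf("-march=%s%s\n", mcpu.c_str(), tags.c_str());
}
```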
ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp (new file, 94 lines)
@@ -0,0 +1,94 @@
#include "ggml-backend-impl.h"

#if defined(__aarch64__)

#if defined(__linux__)
#include <sys/auxv.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif

#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM (1 << 13)
#endif

#if !defined(HWCAP2_SME)
#define HWCAP2_SME (1 << 23)
#endif

struct aarch64_features {
    // has_neon not needed, aarch64 has NEON guaranteed
    bool has_dotprod = false;
    bool has_fp16_va = false;
    bool has_sve = false;
    bool has_sve2 = false;
    bool has_i8mm = false;
    bool has_sme = false;

    aarch64_features() {
#if defined(__linux__)
        uint32_t hwcap = getauxval(AT_HWCAP);
        uint32_t hwcap2 = getauxval(AT_HWCAP2);

        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
        has_fp16_va = !!(hwcap & HWCAP_FPHP);
        has_sve = !!(hwcap & HWCAP_SVE);
        has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
        has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
        has_sme = !!(hwcap2 & HWCAP2_SME);
#elif defined(__APPLE__)
        int oldp = 0;
        size_t size = sizeof(oldp);

        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
            has_dotprod = static_cast<bool>(oldp);
        }

        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
            has_i8mm = static_cast<bool>(oldp);
        }

        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
            has_sme = static_cast<bool>(oldp);
        }

        // Apple apparently does not implement SVE yet
#endif
    }
};

static int ggml_backend_cpu_aarch64_score() {
    int score = 1;
    aarch64_features af;

#ifdef GGML_USE_DOTPROD
    if (!af.has_dotprod) { return 0; }
    score += 1<<1;
#endif
#ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
    if (!af.has_fp16_va) { return 0; }
    score += 1<<2;
#endif
#ifdef GGML_USE_SVE
    if (!af.has_sve) { return 0; }
    score += 1<<3;
#endif
#ifdef GGML_USE_MATMUL_INT8
    if (!af.has_i8mm) { return 0; }
    score += 1<<4;
#endif
#ifdef GGML_USE_SVE2
    if (!af.has_sve2) { return 0; }
    score += 1<<5;
#endif
#ifdef GGML_USE_SME
    if (!af.has_sme) { return 0; }
    score += 1<<6;
#endif

    return score;
}

GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)

# endif // defined(__aarch64__)
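The scoring function above encodes each feature the variant was compiled for as a power of two and returns 0 when the running CPU lacks one of them; with GGML_BACKEND_DL the registry can then keep whichever ggml-cpu-&lt;variant&gt; library reports the highest non-zero score. A hedged sketch of that selection idea (not the actual ggml loader; the names below are illustrative):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Illustrative only: each CPU-backend variant exposes a score() like the one above.
struct cpu_variant {
    std::string name;
    int (*score)();
};

static const cpu_variant * pick_best(const std::vector<cpu_variant> & variants) {
    const cpu_variant * best = nullptr;
    int best_score = 0;                      // 0 means "not usable on this machine"
    for (const auto & v : variants) {
        const int s = v.score();
        if (s > best_score) { best_score = s; best = &v; }
    }
    return best;
}

int main() {
    std::vector<cpu_variant> variants = {
        { "ggml-cpu-armv8.0_1", []{ return 1; } },   // baseline, always usable
        { "ggml-cpu-armv8.2_1", []{ return 3; } },   // dotprod present on this CPU
        { "ggml-cpu-armv9.2_1", []{ return 0; } },   // SME missing -> disqualified
    };
    if (const cpu_variant * v = pick_best(variants)) {
        std::printf("selected %s\n", v->name.c_str());
    }
}
```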
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
|
||||
mul_mv_q4_0_f32_1d_8x_flat
|
||||
mul_mv_q4_0_f32_1d_16x_flat
|
||||
mul_mv_q6_k
|
||||
mul_mv_id_q4_0_f32_8x_flat
|
||||
mul
|
||||
norm
|
||||
relu
|
||||
|
||||
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_upscale;
|
||||
cl_program program_concat;
|
||||
cl_program program_tsembd;
|
||||
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
cl_kernel kernel_add, kernel_add_row;
|
||||
cl_kernel kernel_mul, kernel_mul_row;
|
||||
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_concat_f32_contiguous;
|
||||
cl_kernel kernel_concat_f32_non_contiguous;
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Transpose kernels
|
||||
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// repeat
|
||||
// repeat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// Adreno kernels
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// transpose
|
||||
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
}
|
||||
return false;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (op->src[0]->type == GGML_TYPE_Q4_0) {
|
||||
if (op->src[1]->type == GGML_TYPE_F32) {
|
||||
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
const ggml_tensor * src2 = dst->src[2];
|
||||
GGML_ASSERT(src2);
|
||||
GGML_ASSERT(src2->extra);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
cl_command_queue queue = backend_ctx->queue;
|
||||
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
#ifdef GGML_OPENCL_SOA_Q
|
||||
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
||||
#endif
|
||||
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb00 = src0->nb[0];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
|
||||
const int ne10 = src1->ne[0];
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
const int ne13 = src1->ne[3];
|
||||
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
|
||||
const int ne20 = src2->ne[0];
|
||||
const int ne21 = src2->ne[1];
|
||||
|
||||
const cl_ulong nb21 = src2->nb[1];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
const int ne1 = dst->ne[1];
|
||||
|
||||
const int r2 = ne12/ne02;
|
||||
const int r3 = ne13/ne03;
|
||||
const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
|
||||
int sgs = 32; // subgroup size
|
||||
int nsg = 1; // number of subgroups
|
||||
int nrows = 1; // number of rows in src1
|
||||
int ndst = 4; // number of values produced by each subgroup
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
// subgroup mat vec
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0: {
|
||||
kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
|
||||
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
sgs = 16;
|
||||
nsg = 1;
|
||||
ndst = 8;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
sgs = 64;
|
||||
nsg = 1;
|
||||
ndst = 8;
|
||||
} else {
|
||||
GGML_ASSERT(false && "TODO: Unknown GPU");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
int _ne1 = 1;
|
||||
int ne123 = dst_rows;
|
||||
|
||||
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
|
||||
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
|
||||
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
|
||||
g_profiling_info.emplace_back();
|
||||
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
||||
#else
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
#endif
|
||||
}
|
||||
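For reference, the ND-range computed at the end of ggml_cl_mul_mat_id above gives each subgroup ndst rows of the selected expert matrix and one work-group slice per (expert, row) pair along dimension 2. A small worked example of that sizing with illustrative numbers (not taken from a real model):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    // Illustrative values: Adreno path (sgs = 64, nsg = 1, ndst = 8),
    // 4096 rows in src0 and dst_rows = n_used_experts * n_rows = 2 * 32.
    const int ne01 = 4096, ndst = 8, nsg = 1, sgs = 64, nrows = 1, _ne1 = 1, dst_rows = 2 * 32;

    const size_t global[3] = {
        (size_t)(ne01 + ndst*nsg - 1)/(ndst*nsg)*sgs,   // ceil(4096/8) = 512 subgroups of 64 work-items
        (size_t)(_ne1 + nrows - 1)/nrows*nsg,
        (size_t)dst_rows,
    };
    const size_t local[3] = { (size_t)sgs, (size_t)nsg, 1 };

    std::printf("global = {%zu, %zu, %zu}, local = {%zu, %zu, %zu}\n",
                global[0], global[1], global[2], local[0], local[1], local[2]);
}
```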
|
||||
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_mul_mat;
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_mul_mat_id;
|
||||
break;
|
||||
case GGML_OP_SCALE:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl (new file, 283 lines)
@@ -0,0 +1,283 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#ifdef cl_intel_subgroups
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_intel_required_subgroup_size
|
||||
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
||||
#define INTEL_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
||||
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
||||
#elif defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#endif
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef short int16_t;
|
||||
typedef ushort uint16_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// block_q4_0
|
||||
//------------------------------------------------------------------------------
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
|
||||
// This function requires the original shuffled weights.
|
||||
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
|
||||
// packed together in a byte, so are (q[1], q[17]) and so on.
|
||||
inline float block_q_4_0_dot_y_flat(
|
||||
global uchar * x,
|
||||
global half * dh,
|
||||
float sumy,
|
||||
float16 yl,
|
||||
int il
|
||||
) {
|
||||
float d = *dh;
|
||||
global ushort * qs = ((global ushort *)x + il/2);
|
||||
float acc = 0.f;
|
||||
|
||||
acc += yl.s0 * (qs[0] & 0x000F);
|
||||
acc += yl.s1 * (qs[0] & 0x0F00);
|
||||
acc += yl.s8 * (qs[0] & 0x00F0);
|
||||
acc += yl.s9 * (qs[0] & 0xF000);
|
||||
|
||||
acc += yl.s2 * (qs[1] & 0x000F);
|
||||
acc += yl.s3 * (qs[1] & 0x0F00);
|
||||
acc += yl.sa * (qs[1] & 0x00F0);
|
||||
acc += yl.sb * (qs[1] & 0xF000);
|
||||
|
||||
acc += yl.s4 * (qs[2] & 0x000F);
|
||||
acc += yl.s5 * (qs[2] & 0x0F00);
|
||||
acc += yl.sc * (qs[2] & 0x00F0);
|
||||
acc += yl.sd * (qs[2] & 0xF000);
|
||||
|
||||
acc += yl.s6 * (qs[3] & 0x000F);
|
||||
acc += yl.s7 * (qs[3] & 0x0F00);
|
||||
acc += yl.se * (qs[3] & 0x00F0);
|
||||
acc += yl.sf * (qs[3] & 0xF000);
|
||||
|
||||
return d * (sumy * -8.f + acc);
|
||||
}
|
||||
|
||||
//
|
||||
// This variant outputs 8 values.
|
||||
//
|
||||
#undef N_DST
|
||||
#undef N_SIMDGROUP
|
||||
#undef N_SIMDWIDTH
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
#define N_DST 8 // each SIMD group works on 8 rows
|
||||
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 16 // subgroup size
|
||||
#elif defined (ADRENO_GPU)
|
||||
#define N_DST 8
|
||||
#define N_SIMDGROUP 1
|
||||
#define N_SIMDWIDTH 64
|
||||
#endif
|
||||
|
||||
inline void mul_vec_q_n_f32_8x_flat(
|
||||
global char * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
global float * dst,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne10,
|
||||
int ne12,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
const ulong nb = ne00/QK4_0;
|
||||
|
||||
int r0 = get_group_id(0);
|
||||
int r1 = get_group_id(1);
|
||||
int im = 0;
|
||||
|
||||
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
|
||||
|
||||
int i12 = im%ne12;
|
||||
int i13 = im/ne12;
|
||||
|
||||
// The number of scales is the same as the number of blocks.
|
||||
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
|
||||
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
|
||||
|
||||
global uchar * x = (global uchar *) src0_q + offset0_q;
|
||||
global half * d = (global half *) src0_d + offset0_d;
|
||||
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float16 yl;
|
||||
float8 sumf = 0.f;
|
||||
|
||||
int ix = get_sub_group_local_id()/2;
|
||||
int il = 8*(get_sub_group_local_id()%2);
|
||||
|
||||
global float * yb = y + ix*QK4_0 + il;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
|
||||
float sumy = 0.f;
|
||||
|
||||
sumy += yb[0];
|
||||
sumy += yb[1];
|
||||
sumy += yb[2];
|
||||
sumy += yb[3];
|
||||
sumy += yb[4];
|
||||
sumy += yb[5];
|
||||
sumy += yb[6];
|
||||
sumy += yb[7];
|
||||
|
||||
sumy += yb[16];
|
||||
sumy += yb[17];
|
||||
sumy += yb[18];
|
||||
sumy += yb[19];
|
||||
sumy += yb[20];
|
||||
sumy += yb[21];
|
||||
sumy += yb[22];
|
||||
sumy += yb[23];
|
||||
|
||||
yl.s0 = yb[0];
|
||||
yl.s1 = yb[1]/256.f;
|
||||
|
||||
yl.s2 = yb[2];
|
||||
yl.s3 = yb[3]/256.f;
|
||||
|
||||
yl.s4 = yb[4];
|
||||
yl.s5 = yb[5]/256.f;
|
||||
|
||||
yl.s6 = yb[6];
|
||||
yl.s7 = yb[7]/256.f;
|
||||
|
||||
yl.s8 = yb[16]/16.f;
|
||||
yl.s9 = yb[17]/4096.f;
|
||||
|
||||
yl.sa = yb[18]/16.f;
|
||||
yl.sb = yb[19]/4096.f;
|
||||
|
||||
yl.sc = yb[20]/16.f;
|
||||
yl.sd = yb[21]/4096.f;
|
||||
|
||||
yl.se = yb[22]/16.f;
|
||||
yl.sf = yb[23]/4096.f;
|
||||
|
||||
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
|
||||
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
|
||||
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
|
||||
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
|
||||
|
||||
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
|
||||
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
|
||||
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
|
||||
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
|
||||
|
||||
yb += QK4_0 * (N_SIMDWIDTH/2);
|
||||
}
|
||||
|
||||
float8 tot = (float8)(
|
||||
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
|
||||
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
|
||||
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
|
||||
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
|
||||
);
|
||||
|
||||
if (get_sub_group_local_id() == 0) {
|
||||
if (first_row + 0 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
|
||||
}
|
||||
if (first_row + 1 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
|
||||
}
|
||||
if (first_row + 2 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
|
||||
}
|
||||
if (first_row + 3 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
|
||||
}
|
||||
|
||||
if (first_row + 4 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
|
||||
}
|
||||
if (first_row + 5 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
|
||||
}
|
||||
if (first_row + 6 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
|
||||
}
|
||||
if (first_row + 7 < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTEL_GPU
|
||||
REQD_SUBGROUP_SIZE_16
|
||||
#elif defined (ADRENO_GPU)
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
|
||||
global char * src0_q,
|
||||
global half * src0_d,
|
||||
global float * src1,
|
||||
ulong offset1,
|
||||
global char * src2,
|
||||
ulong offset2,
|
||||
global float * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb00,
|
||||
ulong nb02,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int ne20,
|
||||
int ne21,
|
||||
ulong nb21,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int r2,
|
||||
int r3
|
||||
) {
|
||||
src1 = (global float *)((global char *)src1 + offset1);
|
||||
src2 = (global char *)((global char *)src2 + offset2);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
|
||||
const int iid1 = get_group_id(2)/ne20;
|
||||
const int idx = get_group_id(2)%ne20;
|
||||
|
||||
const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
|
||||
|
||||
const int i11 = idx%ne11;
|
||||
const int i12 = iid1;
|
||||
|
||||
const int i1 = idx;
|
||||
const int i2 = i12;
|
||||
|
||||
global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
|
||||
global half * src0_d_cur = src0_d + (i02*nb02/nb00);
|
||||
global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
|
||||
global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
|
||||
|
||||
mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
|
||||
}
|
||||
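As the comment on block_q_4_0_dot_y_flat in the kernel above notes, the q4_0 weights arrive pre-shuffled so that q[i] and q[i+16] share one byte, and the masked 16-bit reads pick the nibbles back out; the per-block scale d is stored separately in the flat layout. A hedged host-side sketch of that packing in plain C++ (values illustrative):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // One byte of a q4_0 block packs q[i] in the low nibble and q[i+16] in the
    // high nibble; dequantisation is (code - 8) * d with the per-block scale d.
    const uint8_t byte = 0xA3;        // q[0] code = 3, q[16] code = 10
    const float   d    = 0.05f;       // per-block scale, stored in the separate d buffer

    const float q0  = ((byte & 0x0F) - 8) * d;   // -> -0.25
    const float q16 = ((byte >> 4)   - 8) * d;   // ->  0.10
    std::printf("q[0] = %.2f, q[16] = %.2f\n", q0, q16);
}
```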
@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
||||
#define VK_VENDOR_ID_INTEL 0x8086
|
||||
#define VK_VENDOR_ID_NVIDIA 0x10de
|
||||
|
||||
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
|
||||
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
|
||||
|
||||
#define GGML_VK_MAX_NODES 8192
|
||||
|
||||
@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
||||
|
||||
struct ggml_backend_vk_context;
|
||||
|
||||
struct vk_queue {
|
||||
uint32_t queue_family_index;
|
||||
vk::Queue queue;
|
||||
vk::CommandPool pool;
|
||||
uint32_t cmd_buffer_idx;
|
||||
std::vector<vk::CommandBuffer> cmd_buffers;
|
||||
|
||||
vk::PipelineStageFlags stage_flags;
|
||||
|
||||
bool transfer_only;
|
||||
};
|
||||
#define MAX_PARAMETER_COUNT 8
|
||||
|
||||
struct vk_pipeline_struct {
|
||||
std::string name;
|
||||
vk::ShaderModule shader_module;
|
||||
vk::DescriptorSetLayout dsl;
|
||||
std::vector<vk::DescriptorPool> descriptor_pools;
|
||||
std::vector<vk::DescriptorSet> descriptor_sets;
|
||||
uint32_t descriptor_set_idx;
|
||||
vk::PipelineLayout layout;
|
||||
vk::Pipeline pipeline;
|
||||
uint32_t push_constant_size;
|
||||
@@ -167,6 +153,40 @@ struct ggml_backend_vk_buffer_type_context {
|
||||
vk_device device;
|
||||
};
|
||||
|
||||
struct vk_queue;
|
||||
|
||||
// Stores command pool/buffers. There's an instance of this
|
||||
// for each (context,queue) pair and for each (device,queue) pair.
|
||||
struct vk_command_pool {
|
||||
void init(vk_device& device, vk_queue *q_);
|
||||
void destroy(vk::Device& device);
|
||||
|
||||
vk::CommandPool pool;
|
||||
uint32_t cmd_buffer_idx;
|
||||
std::vector<vk::CommandBuffer> cmd_buffers;
|
||||
|
||||
vk_queue *q;
|
||||
};
|
||||
|
||||
struct vk_queue {
|
||||
uint32_t queue_family_index;
|
||||
vk::Queue queue;
|
||||
|
||||
vk_command_pool cmd_pool;
|
||||
|
||||
vk::PipelineStageFlags stage_flags;
|
||||
|
||||
bool transfer_only;
|
||||
|
||||
// copy everything except the cmd_pool
|
||||
void copyFrom(vk_queue &other) {
|
||||
queue_family_index = other.queue_family_index;
|
||||
queue = other.queue;
|
||||
stage_flags = other.stage_flags;
|
||||
transfer_only = other.transfer_only;
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
|
||||
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
||||
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
|
||||
@@ -341,6 +361,8 @@ struct vk_device_struct {
|
||||
// set to true to indicate that some shaders need to be compiled after the dryrun
|
||||
bool need_compiles {};
|
||||
|
||||
vk::DescriptorSetLayout dsl;
|
||||
|
||||
vk_matmul_pipeline pipeline_matmul_f32 {};
|
||||
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
|
||||
vk_matmul_pipeline pipeline_matmul_bf16 {};
|
||||
@@ -458,7 +480,6 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
||||
|
||||
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
||||
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
||||
|
||||
std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
|
||||
|
||||
@@ -483,10 +504,8 @@ struct vk_device_struct {
|
||||
|
||||
ggml_vk_destroy_buffer(sync_staging);
|
||||
|
||||
device.destroyCommandPool(compute_queue.pool);
|
||||
if (!single_queue) {
|
||||
device.destroyCommandPool(transfer_queue.pool);
|
||||
}
|
||||
compute_queue.cmd_pool.destroy(device);
|
||||
transfer_queue.cmd_pool.destroy(device);
|
||||
|
||||
for (auto& pipeline : pipelines) {
|
||||
if (pipeline.second.expired()) {
|
||||
@@ -498,10 +517,26 @@ struct vk_device_struct {
|
||||
}
|
||||
pipelines.clear();
|
||||
|
||||
device.destroyDescriptorSetLayout(dsl);
|
||||
|
||||
device.destroy();
|
||||
}
|
||||
};
|
||||
|
||||
void vk_command_pool::init(vk_device& device, vk_queue *q_) {
|
||||
cmd_buffer_idx = 0;
|
||||
q = q_;
|
||||
|
||||
vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
|
||||
pool = device->device.createCommandPool(command_pool_create_info);
|
||||
}
|
||||
|
||||
void vk_command_pool::destroy(vk::Device& device) {
|
||||
device.destroyCommandPool(pool);
|
||||
pool = nullptr;
|
||||
cmd_buffers.clear();
|
||||
}
|
||||
|
||||
struct vk_buffer_struct {
|
||||
vk::Buffer buffer = VK_NULL_HANDLE;
|
||||
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
|
||||
@@ -819,7 +854,7 @@ struct vk_context_struct {
|
||||
std::vector<vk_staging_memcpy> in_memcpys;
|
||||
std::vector<vk_staging_memcpy> out_memcpys;
|
||||
|
||||
vk_queue * q;
|
||||
vk_command_pool * p {};
|
||||
};
|
||||
typedef std::shared_ptr<vk_context_struct> vk_context;
|
||||
typedef std::weak_ptr<vk_context_struct> vk_context_ref;
|
||||
@@ -930,6 +965,14 @@ struct ggml_backend_vk_context {
|
||||
vk_context_ref transfer_ctx;
|
||||
|
||||
std::vector<vk_context_ref> tensor_ctxs;
|
||||
|
||||
std::vector<vk::DescriptorPool> descriptor_pools;
|
||||
std::vector<vk::DescriptorSet> descriptor_sets;
|
||||
uint32_t descriptor_set_idx {};
|
||||
uint32_t pipeline_descriptor_set_requirements {};
|
||||
|
||||
vk_command_pool compute_cmd_pool;
|
||||
vk_command_pool transfer_cmd_pool;
|
||||
};
|
||||
|
||||
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
|
||||
@@ -1060,39 +1103,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
|
||||
disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
|
||||
GGML_ASSERT(parameter_count > 0);
|
||||
GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
|
||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||
|
||||
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
||||
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
||||
|
||||
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
|
||||
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
|
||||
for (uint32_t i = 0; i < parameter_count; i++) {
|
||||
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
|
||||
dsl_binding_flags.push_back({});
|
||||
}
|
||||
|
||||
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
|
||||
|
||||
vk::PushConstantRange pcr(
|
||||
vk::ShaderStageFlagBits::eCompute,
|
||||
0,
|
||||
pipeline->push_constant_size
|
||||
);
|
||||
|
||||
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
|
||||
{},
|
||||
dsl_binding);
|
||||
descriptor_set_layout_create_info.setPNext(&dslbfci);
|
||||
pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
|
||||
|
||||
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
||||
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
||||
pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
||||
|
||||
pipeline->descriptor_set_idx = 0;
|
||||
|
||||
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
|
||||
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
|
||||
pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
|
||||
|
||||
std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
|
||||
@@ -1167,15 +1190,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||
|
||||
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
||||
VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
|
||||
for (auto& pool : pipeline->descriptor_pools) {
|
||||
device.destroyDescriptorPool(pool);
|
||||
}
|
||||
pipeline->descriptor_pools.clear();
|
||||
pipeline->descriptor_sets.clear();
|
||||
pipeline->descriptor_set_idx = 0;
|
||||
|
||||
device.destroyDescriptorSetLayout(pipeline->dsl);
|
||||
|
||||
device.destroyPipelineLayout(pipeline->layout);
|
||||
|
||||
device.destroyShaderModule(pipeline->shader_module);
|
||||
@@ -1183,97 +1197,76 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
|
||||
device.destroyPipeline(pipeline->pipeline);
|
||||
}
|
||||
|
||||
static void ggml_pipeline_request_descriptor_sets(vk_device& device, vk_pipeline& pipeline, uint32_t n) {
|
||||
static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
|
||||
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
||||
device->pipeline_descriptor_set_requirements[pipeline->name] += n;
|
||||
ctx->pipeline_descriptor_set_requirements += n;
|
||||
if (!pipeline->compiled) {
|
||||
pipeline->needed = true;
|
||||
device->need_compiles = true;
|
||||
ctx->device->need_compiles = true;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_pipeline_allocate_descriptor_sets(vk_device& device) {
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
|
||||
|
||||
for (auto& pair : device->pipeline_descriptor_set_requirements) {
|
||||
vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
|
||||
const uint64_t n = pair.second;
|
||||
if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
|
||||
// Enough descriptors are available
|
||||
return;
|
||||
}
|
||||
|
||||
VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
||||
vk_device& device = ctx->device;
|
||||
|
||||
if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) {
|
||||
// Enough descriptors are available
|
||||
continue;
|
||||
uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
|
||||
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
|
||||
while (to_alloc > 0) {
|
||||
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
|
||||
to_alloc -= alloc_count;
|
||||
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
|
||||
if (pool_idx >= ctx->descriptor_pools.size()) {
|
||||
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
||||
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
||||
ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
||||
}
|
||||
|
||||
uint32_t to_alloc = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size();
|
||||
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
|
||||
while (to_alloc > 0) {
|
||||
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
|
||||
to_alloc -= alloc_count;
|
||||
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
||||
|
||||
if (pool_idx >= pipeline->descriptor_pools.size()) {
|
||||
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
||||
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
||||
pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
||||
}
|
||||
|
||||
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
|
||||
for (uint32_t i = 0; i < alloc_count; i++) {
|
||||
layouts[i] = pipeline->dsl;
|
||||
}
|
||||
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[pool_idx], alloc_count, layouts.data());
|
||||
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
||||
pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
|
||||
|
||||
pool_idx++;
|
||||
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
|
||||
for (uint32_t i = 0; i < alloc_count; i++) {
|
||||
layouts[i] = device->dsl;
|
||||
}
|
||||
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
|
||||
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
||||
ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
|
||||
|
||||
pool_idx++;
|
||||
}
|
||||
}
|
||||
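The rewritten allocator above now tracks a single, context-wide supply of descriptor sets (all pipelines share the device-level layout sized for MAX_PARAMETER_COUNT bindings), growing it in chunks of VK_DEVICE_DESCRIPTOR_POOL_SIZE. A worked example of that bookkeeping, with illustrative numbers:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t pool_size = 256;          // VK_DEVICE_DESCRIPTOR_POOL_SIZE after this change
    const uint32_t have = 300;               // descriptor sets already allocated for this context
    const uint32_t need = 700;               // pipeline_descriptor_set_requirements for the graph

    uint32_t to_alloc       = need - have;
    uint32_t pool_remaining = pool_size - have % pool_size;
    uint32_t pool_idx       = have / pool_size;

    while (to_alloc > 0) {
        const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
        // a new vk::DescriptorPool is created whenever pool_idx is past the last existing one
        std::printf("pool %u: allocate %u sets\n", pool_idx, alloc_count);
        to_alloc      -= alloc_count;
        pool_remaining = pool_size;
        pool_idx++;
    }
    // prints: pool 1: allocate 212 sets, then pool 2: allocate 188 sets
}
```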
|
||||
static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
|
||||
VK_LOG_DEBUG("ggml_pipeline_cleanup(" << pipeline->name << ")");
|
||||
pipeline->descriptor_set_idx = 0;
|
||||
}
|
||||
|
||||
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_queue& q) {
|
||||
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
|
||||
if (q.cmd_buffers.size() > q.cmd_buffer_idx) {
|
||||
if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
|
||||
// Reuse command buffer
|
||||
return q.cmd_buffers[q.cmd_buffer_idx++];
|
||||
return p.cmd_buffers[p.cmd_buffer_idx++];
|
||||
}
|
||||
|
||||
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
||||
q.pool,
|
||||
p.pool,
|
||||
vk::CommandBufferLevel::ePrimary,
|
||||
1);
|
||||
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
|
||||
auto buf = cmd_buffers.front();
|
||||
|
||||
q.cmd_buffers.push_back(buf);
|
||||
q.cmd_buffer_idx++;
|
||||
p.cmd_buffers.push_back(buf);
|
||||
p.cmd_buffer_idx++;
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
||||
VK_LOG_DEBUG("ggml_vk_create_submission()");
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_create_cmd_buffer(device, q);
|
||||
s.wait_semaphores = std::move(wait_semaphores);
|
||||
s.signal_semaphores = std::move(signal_semaphores);
|
||||
return s;
|
||||
}
|
||||
|
||||
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
if (ctx->seqs.empty()) {
|
||||
if (fence) {
|
||||
ctx->q->queue.submit({}, fence);
|
||||
ctx->p->q->queue.submit({}, fence);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -1312,7 +1305,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
tl_signal_vals.push_back({});
|
||||
tl_signal_semaphores.push_back({});
|
||||
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
|
||||
stage_flags[idx].push_back(ctx->q->stage_flags);
|
||||
stage_flags[idx].push_back(ctx->p->q->stage_flags);
|
||||
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
|
||||
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
|
||||
}
|
||||
@@ -1342,7 +1335,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
||||
}
|
||||
}
|
||||
|
||||
ctx->q->queue.submit(submit_infos, fence);
|
||||
ctx->p->q->queue.submit(submit_infos, fence);
|
||||
|
||||
ctx->seqs.clear();
|
||||
}
|
||||
@@ -1400,28 +1393,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
|
||||
q.queue_family_index = queue_family_index;
|
||||
q.transfer_only = transfer_only;
|
||||
|
||||
vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
|
||||
q.pool = device->device.createCommandPool(command_pool_create_info_compute);
|
||||
|
||||
q.cmd_buffer_idx = 0;
|
||||
q.cmd_pool.init(device, &q);
|
||||
|
||||
q.queue = device->device.getQueue(queue_family_index, queue_index);
|
||||
|
||||
q.stage_flags = stage_flags;
|
||||
}
|
||||
|
||||
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) {
|
||||
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
|
||||
vk_context result = std::make_shared<vk_context_struct>();
|
||||
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
|
||||
ctx->gc.contexts.emplace_back(result);
|
||||
result->q = &q;
|
||||
result->p = &p;
|
||||
return result;
|
||||
}
|
||||
|
||||
static vk_context ggml_vk_create_temporary_context(vk_queue& q) {
|
||||
static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
|
||||
vk_context result = std::make_shared<vk_context_struct>();
|
||||
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
|
||||
result->q = &q;
|
||||
result->p = &p;
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -1454,15 +1444,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
|
||||
return ctx->gc.events[ctx->event_idx++];
|
||||
}
|
||||
|
||||
static void ggml_vk_queue_cleanup(vk_device& device, vk_queue& q) {
|
||||
VK_LOG_DEBUG("ggml_vk_queue_cleanup()");
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
|
||||
VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
|
||||
|
||||
// Requires command buffers to be done
|
||||
device->device.resetCommandPool(q.pool);
|
||||
q.cmd_buffer_idx = 0;
|
||||
device->device.resetCommandPool(p.pool);
|
||||
p.cmd_buffer_idx = 0;
|
||||
}
|
||||
|
||||
static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
|
||||
VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
|
||||
|
||||
// Arbitrary frequency to cleanup/reuse command buffers
|
||||
static constexpr uint32_t cleanup_frequency = 10;
|
||||
|
||||
if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
|
||||
ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
|
||||
}
|
||||
if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
|
||||
ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
||||
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
||||
@@ -1481,8 +1485,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
||||
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(device->mutex);
|
||||
|
||||
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
||||
|
||||
if (size == 0) {
|
||||
@@ -1611,11 +1613,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
||||
static void ggml_vk_sync_buffers(vk_context& ctx) {
|
||||
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
|
||||
|
||||
const bool transfer_queue = ctx->q->transfer_only;
|
||||
const bool transfer_queue = ctx->p->q->transfer_only;
|
||||
|
||||
ctx->s->buffer.pipelineBarrier(
|
||||
ctx->q->stage_flags,
|
||||
ctx->q->stage_flags,
|
||||
ctx->p->q->stage_flags,
|
||||
ctx->p->q->stage_flags,
|
||||
{},
|
||||
{ {
|
||||
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
|
||||
@@ -1634,8 +1636,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
||||
|
||||
ctx->s->buffer.waitEvents(
|
||||
events,
|
||||
ctx->q->stage_flags,
|
||||
ctx->q->stage_flags,
|
||||
ctx->p->q->stage_flags,
|
||||
ctx->p->q->stage_flags,
|
||||
{},
|
||||
{},
|
||||
{}
|
||||
@@ -3369,6 +3371,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
|
||||
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
|
||||
for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
|
||||
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
|
||||
dsl_binding_flags.push_back({});
|
||||
}
|
||||
|
||||
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
|
||||
|
||||
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
|
||||
{},
|
||||
dsl_binding);
|
||||
descriptor_set_layout_create_info.setPNext(&dslbfci);
|
||||
device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
|
||||
|
||||
ggml_vk_load_shaders(device);
|
||||
|
||||
if (!device->single_queue) {
|
||||
@@ -3376,7 +3394,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
|
||||
} else {
|
||||
// TODO: Use pointer or reference to avoid copy
|
||||
device->transfer_queue = device->compute_queue;
|
||||
device->transfer_queue.copyFrom(device->compute_queue);
|
||||
device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
|
||||
}
|
||||
|
||||
device->buffer_type = {
|
||||
@@ -3742,6 +3761,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||
ctx->fence = ctx->device->device.createFence({});
|
||||
ctx->almost_ready_fence = ctx->device->device.createFence({});
|
||||
|
||||
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
|
||||
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
|
||||
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
|
||||
@@ -4107,9 +4129,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
|
||||
}
|
||||
}
|
||||
|
||||
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bool one_time = true) {
|
||||
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_create_cmd_buffer(device, q);
|
||||
s.buffer = ggml_vk_create_cmd_buffer(device, p);
|
||||
if (one_time) {
|
||||
s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
|
||||
} else {
|
||||
@@ -4154,10 +4176,10 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
||||
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
|
||||
}
|
||||
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
|
||||
GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
|
||||
GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
|
||||
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
|
||||
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
|
||||
|
||||
vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
|
||||
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
|
||||
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
||||
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
|
||||
|
||||
@@ -4194,7 +4216,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
|
||||
ggml_vk_ctx_end(subctx);
|
||||
}
|
||||
|
||||
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->q) });
|
||||
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
|
||||
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
|
||||
}
|
||||
|
||||
@@ -4395,7 +4417,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
||||
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
|
||||
}
|
||||
} else {
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
|
||||
std::lock_guard<std::mutex> guard(dst->device->mutex);
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dst->device, subctx);
|
||||
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
@@ -4407,6 +4431,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
||||
ggml_vk_submit(subctx, dst->device->fence);
|
||||
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
||||
dst->device->device.resetFences({ dst->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(dst->device);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4483,7 +4508,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
||||
|
||||
memcpy(dst, (uint8_t *) src->ptr + offset, size);
|
||||
} else {
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
|
||||
std::lock_guard<std::mutex> guard(src->device->mutex);
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
@@ -4491,6 +4518,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
|
||||
for (auto& cpy : subctx->out_memcpys) {
|
||||
memcpy(cpy.dst, cpy.src, cpy.n);
|
||||
@@ -4510,15 +4538,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
|
||||
|
||||
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
||||
if (src->device == dst->device) {
|
||||
std::lock_guard<std::mutex> guard(src->device->mutex);
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
|
||||
// Copy within the device
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(src->device, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, src->device->fence);
|
||||
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
|
||||
src->device->device.resetFences({ src->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(src->device);
|
||||
} else {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
|
||||
// Copy device to device
|
||||
@@ -4543,7 +4573,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
|
||||
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
|
||||
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue);
|
||||
std::lock_guard<std::mutex> guard(dst->device->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dst->device, subctx);
|
||||
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
@@ -4551,6 +4582,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
|
||||
ggml_vk_submit(subctx, dst->device->fence);
|
||||
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
|
||||
dst->device->device.resetFences({ dst->device->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(dst->device);
|
||||
}
|
||||
|
||||
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
|
||||
@@ -4964,18 +4996,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
||||
}
|
||||
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
if (qx_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
||||
}
|
||||
if (qy_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
||||
}
|
||||
if (quantize_y) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_q8_1, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
|
||||
}
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
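In these matmul paths the dryrun branch only records how many descriptor sets each pipeline will need; allocation and the real dispatch happen in a later pass. A toy, self-contained illustration of that count-then-allocate idea (hypothetical names, unrelated to the actual vk_pipeline structs):

    // sketch only: pass 1 counts descriptor sets per pipeline, pass 2 consumes them
    #include <cstdio>
    #include <map>
    #include <string>

    int main() {
        std::map<std::string, int> requested;

        // pass 1 ("dryrun"): each op requests the sets it will need
        requested["mul_mat"]        += 1;
        requested["split_k_reduce"] += 1;
        requested["mul_mat"]        += 1;

        // pass 2: allocate exactly what was requested, then dispatch
        for (const auto & [name, count] : requested) {
            std::printf("allocate %d descriptor set(s) for pipeline %s\n", count, name.c_str());
        }
        return 0;
    }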
@@ -5157,12 +5189,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
|
||||
// Request descriptor sets
|
||||
if (qx_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
||||
}
|
||||
if (qy_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
||||
}
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5295,7 +5327,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
||||
|
||||
if (dryrun) {
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5384,7 +5416,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
||||
|
||||
if (dryrun) {
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -5571,12 +5603,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
||||
}
|
||||
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
if (qx_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
||||
}
|
||||
if (qy_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -5765,12 +5797,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
||||
|
||||
// Request descriptor sets
|
||||
if (qx_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_0, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
||||
}
|
||||
if (qy_needs_dequant) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, to_fp16_vk_1, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
||||
}
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, dmmv, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -6090,9 +6122,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
|
||||
if (dryrun) {
|
||||
// Request descriptor sets
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -6655,7 +6687,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
}
|
||||
|
||||
if (dryrun) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -7036,7 +7068,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
||||
GGML_ASSERT(pipeline != nullptr);
|
||||
|
||||
if (dryrun) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -7175,7 +7207,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
|
||||
GGML_ASSERT(pipeline != nullptr);
|
||||
|
||||
if (dryrun) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -7853,9 +7885,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
}
|
||||
}
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
|
||||
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
||||
// Resize buffer
|
||||
@@ -7870,7 +7902,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
ggml_vk_load_shaders(ctx->device);
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
@@ -7912,7 +7944,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
|
||||
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
|
||||
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
for (size_t i = 0; i < num_it; i++) {
|
||||
ggml_vk_matmul(
|
||||
@@ -7928,6 +7960,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
ggml_vk_submit(subctx, ctx->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
||||
@@ -8029,16 +8062,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
||||
|
||||
free(d_chk);
|
||||
|
||||
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
|
||||
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
||||
|
||||
ggml_vk_destroy_buffer(d_X);
|
||||
ggml_vk_destroy_buffer(d_Y);
|
||||
ggml_vk_destroy_buffer(d_D);
|
||||
|
||||
ggml_pipeline_cleanup(p);
|
||||
ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
free(d);
|
||||
@@ -8116,17 +8146,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
ggml_vk_quantize_data(x, qx, ne, quant);
|
||||
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, p, 1);
|
||||
|
||||
if (ctx->device->need_compiles) {
|
||||
ggml_vk_load_shaders(ctx->device);
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
|
||||
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
||||
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
|
||||
@@ -8137,6 +8167,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
ggml_vk_submit(subctx, ctx->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
@@ -8216,17 +8247,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
//
|
||||
// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
|
||||
//
|
||||
// ggml_pipeline_request_descriptor_sets(ctx->device, p, 1);
|
||||
// ggml_pipeline_request_descriptor_sets(ctx, p, 1);
|
||||
//
|
||||
// if (ctx->device->need_compiles) {
|
||||
// ggml_vk_load_shaders(ctx->device);
|
||||
// }
|
||||
//
|
||||
// ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
// ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
//
|
||||
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
|
||||
//
|
||||
// vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
// ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
|
||||
// ggml_vk_ctx_end(subctx);
|
||||
@@ -8236,6 +8267,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
||||
// ggml_vk_submit(subctx, ctx->fence);
|
||||
// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
|
||||
// ctx->device->device.resetFences({ ctx->fence });
|
||||
// ggml_vk_queue_command_pools_cleanup(ctx->device);
|
||||
//
|
||||
// auto end = std::chrono::high_resolution_clock::now();
|
||||
//
|
||||
@@ -8375,9 +8407,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
// y[i] = i % k;
|
||||
}
|
||||
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, p, num_it);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
||||
if (split_k > 1) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
||||
|
||||
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
||||
// Resize buffer
|
||||
@@ -8388,19 +8420,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
}
|
||||
}
|
||||
if (mmq) {
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_quantize_q8_1, num_it);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
|
||||
}
|
||||
|
||||
if (ctx->device->need_compiles) {
|
||||
ggml_vk_load_shaders(ctx->device);
|
||||
}
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
|
||||
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
||||
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
|
||||
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ggml_vk_ctx_begin(ctx->device, subctx);
|
||||
if (mmq) {
|
||||
for (size_t i = 0; i < num_it; i++) {
|
||||
@@ -8429,6 +8461,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
ggml_vk_submit(subctx, ctx->fence);
|
||||
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
|
||||
ctx->device->device.resetFences({ ctx->fence });
|
||||
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
@@ -8743,7 +8776,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
if (!dryrun) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
@@ -8797,7 +8830,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
||||
// do the only thing needed for the dryrun.
|
||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
|
||||
ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
return false;
|
||||
}
|
||||
default:
|
||||
@@ -9189,19 +9222,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
||||
}
|
||||
ctx->gc.temp_buffers.clear();
|
||||
|
||||
for (auto& dsr : ctx->device->pipeline_descriptor_set_requirements) {
|
||||
vk_pipeline_ref plr = ctx->device->pipelines[dsr.first];
|
||||
|
||||
if (plr.expired()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
vk_pipeline pl = plr.lock();
|
||||
ggml_pipeline_cleanup(pl);
|
||||
}
|
||||
|
||||
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
|
||||
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
||||
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
||||
|
||||
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
|
||||
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
|
||||
@@ -9222,7 +9244,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
||||
|
||||
ctx->tensor_ctxs.clear();
|
||||
ctx->gc.contexts.clear();
|
||||
ctx->device->pipeline_descriptor_set_requirements.clear();
|
||||
ctx->pipeline_descriptor_set_requirements = 0;
|
||||
ctx->descriptor_set_idx = 0;
|
||||
}
|
||||
|
||||
// Clean up on backend free
|
||||
@@ -9249,6 +9272,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {

ctx->device->device.destroyFence(ctx->fence);
ctx->device->device.destroyFence(ctx->almost_ready_fence);

for (auto& pool : ctx->descriptor_pools) {
ctx->device->device.destroyDescriptorPool(pool);
}
ctx->descriptor_pools.clear();
ctx->descriptor_sets.clear();

ctx->compute_cmd_pool.destroy(ctx->device->device);
ctx->transfer_cmd_pool.destroy(ctx->device->device);
}

static int ggml_vk_get_device_count() {
@@ -9515,7 +9547,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
} else {
|
||||
@@ -9538,7 +9570,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
} else {
|
||||
@@ -9561,7 +9593,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
|
||||
|
||||
if (ctx->transfer_ctx.expired()) {
|
||||
// Initialize new transfer context
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
|
||||
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
||||
ctx->transfer_ctx = transfer_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
||||
} else {
|
||||
@@ -9622,7 +9654,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ggml_vk_load_shaders(ctx->device);
|
||||
}
|
||||
ggml_vk_preallocate_buffers(ctx);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx->device);
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx);
|
||||
|
||||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
@@ -9654,7 +9686,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
||||
|
||||
GGML_ASSERT(ctx->compute_ctx.expired());
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
||||
@@ -9689,7 +9721,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
|
||||
if (vk_perf_logger_enabled) {
|
||||
if (ctx->compute_ctx.expired()) {
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
|
||||
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
||||
ctx->compute_ctx = compute_ctx;
|
||||
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
||||
} else {
|
||||
|
||||
@@ -1332,7 +1332,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);

if (n_tokens % n_seqs != 0) {
n_tokens = (n_tokens / n_seqs) * n_seqs;
n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
n_outputs = std::min(n_outputs, n_tokens);

LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
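The change above rounds n_tokens up to the next multiple of n_seqs instead of down, so the reserved graph can never be smaller than the requested batch. A quick standalone check of the two formulas (example values chosen arbitrarily):

    #include <cstdio>
    #include <cstdint>

    int main() {
        uint32_t n_tokens = 10, n_seqs = 4;
        uint32_t down = (n_tokens / n_seqs) * n_seqs;                  // old behaviour: 8
        uint32_t up   = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // new behaviour: 12
        std::printf("down = %u, up = %u\n", down, up);
        return 0;
    }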
@@ -127,6 +127,9 @@ llama_kv_cache_unified::llama_kv_cache_unified(
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}

const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
}

void llama_kv_cache_unified::clear(bool data) {
@@ -517,14 +520,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
return -1;
|
||||
}
|
||||
|
||||
//#define FIND_SLOT_DEBUG 1
|
||||
#if FIND_SLOT_DEBUG
|
||||
LLAMA_LOG_WARN("begin: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", cells.used_max_p1(), cells.get_used(), head, n_swa);
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_CONT("\n");
|
||||
LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
|
||||
|
||||
// for debugging
|
||||
{
|
||||
std::string ss;
|
||||
if (n_swa > 0) {
|
||||
if ((debug == 2 && n_swa > 0) || debug > 2) {
|
||||
std::string ss;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
if (cells.is_empty(i)) {
|
||||
ss += '.';
|
||||
@@ -532,21 +533,45 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
ss += std::to_string(cells.seq_get(i));
|
||||
}
|
||||
if (i%256 == 255) {
|
||||
ss += " *";
|
||||
ss += '\n';
|
||||
}
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_WARN("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
if (cells.seq_pos_min(s) < 0) {
|
||||
continue;
|
||||
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
LLAMA_LOG_WARN("kv_cells: n_swa = %4d, min[%d] = %5d, max[%d] = %5d\n", n_swa, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
|
||||
if ((debug == 2 && n_swa > 0) || debug > 2) {
|
||||
std::string ss;
|
||||
for (uint32_t i = 0; i < cells.size(); ++i) {
|
||||
std::string cur;
|
||||
if (cells.is_empty(i)) {
|
||||
cur = '.';
|
||||
} else {
|
||||
cur = std::to_string(cells.pos_get(i));
|
||||
}
|
||||
const int n = cur.size();
|
||||
for (int j = 0; j < 5 - n; ++j) {
|
||||
cur += ' ';
|
||||
}
|
||||
ss += cur;
|
||||
if (i%256 == 255) {
|
||||
ss += " *";
|
||||
}
|
||||
if (i%64 == 63) {
|
||||
ss += '\n';
|
||||
}
|
||||
}
|
||||
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
|
||||
}
|
||||
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
if (cells.seq_pos_min(s) < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
@@ -557,21 +582,15 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of what the minimum sequence positions would be if we accept the ubatch
|
||||
llama_seq_id seq_pos_min[LLAMA_MAX_PARALLEL_SEQUENCES];
|
||||
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
|
||||
seq_pos_min[s] = cells.seq_pos_min(s);
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
const llama_pos pos = ubatch.pos[i];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
//const llama_pos pos = ubatch.pos[i];
|
||||
//const llama_seq_id seq_id = ubatch.seq_id[i][0];
|
||||
|
||||
// can we use this cell? either:
|
||||
// - the cell is empty
|
||||
// - the cell is occupied only by one sequence:
|
||||
// - mask causally, if the sequence is the same as the one we are inserting
|
||||
// - (disabled) mask causally, if the sequence is the same as the one we are inserting
|
||||
// - mask SWA, using current max pos for that sequence in the cache
|
||||
// always insert in the cell with minimum pos
|
||||
bool can_use = cells.is_empty(head_cur + i);
|
||||
@@ -579,21 +598,17 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
|
||||
if (!can_use && cells.seq_count(head_cur + i) == 1) {
|
||||
const llama_pos pos_cell = cells.pos_get(head_cur + i);
|
||||
|
||||
// causal mask
|
||||
if (cells.seq_has(head_cur + i, seq_id)) {
|
||||
can_use = pos_cell >= pos;
|
||||
}
|
||||
// (disabled) causal mask
|
||||
// note: it's better to purge any "future" tokens beforehand
|
||||
//if (cells.seq_has(head_cur + i, seq_id)) {
|
||||
// can_use = pos_cell >= pos;
|
||||
//}
|
||||
|
||||
if (!can_use) {
|
||||
const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
|
||||
|
||||
// SWA mask
|
||||
// note: we insert only in the cell with minimum pos in order to preserve the invariant that
|
||||
// all positions between [pos_min, pos_max] for each sequence will be present in the cache
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
|
||||
if (pos_cell == seq_pos_min[seq_id_cell] &&
|
||||
is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
seq_pos_min[seq_id_cell]++;
|
||||
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
||||
can_use = true;
|
||||
}
|
||||
}
|
||||
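The rewritten branch above reuses a cell only when its position is already masked out by the sliding window for that sequence. Purely as an illustration of what such a window test can look like — the actual is_masked_swa() in llama.cpp may treat boundaries and window types differently:

    // illustrative only: a token at pos_tok is outside a sliding window of size n_swa
    // when it lies more than n_swa positions behind the newest position pos_max
    static bool swa_masked_sketch(long long pos_tok, long long pos_max, long long n_swa) {
        return pos_max - pos_tok > n_swa;
    }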
@@ -621,8 +636,22 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
}

void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
// keep track of the max sequence position that we would overwrite with this ubatch
// for non-SWA cache, this would be always empty
llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
seq_pos_max_rm[s] = -1;
}

for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
if (!cells.is_empty(head_cur + i)) {
assert(cells.seq_count(head_cur + i) == 1);

const llama_seq_id seq_id = cells.seq_get(head_cur + i);
const llama_pos pos = cells.pos_get(head_cur + i);

seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);

cells.rm(head_cur + i);
}

@@ -633,6 +662,22 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
}
}

// note: we want to preserve the invariant that all positions between [pos_min, pos_max] for each sequence
// will be present in the cache. so we have to purge any position which is less than those we would overwrite
// ref: https://github.com/ggml-org/llama.cpp/pull/13746#issuecomment-2916057092
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
if (seq_pos_max_rm[s] == -1) {
continue;
}

if (cells.seq_pos_min(s) <= seq_pos_max_rm[s]) {
LLAMA_LOG_DEBUG("%s: purging positions [%d, %d] of sequence %d from KV cache\n",
__func__, cells.seq_pos_min(s), seq_pos_max_rm[s], s);

seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
}
}

// move the head at the end of the slot
head = head_cur + ubatch.n_tokens;
}
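A toy example of the invariant this purge preserves (positions cached for a sequence must stay contiguous); the numbers are made up and the helpers are not part of llama.cpp:

    // toy illustration: if a sequence currently holds positions [pos_min, pos_max] and the new
    // ubatch overwrites cells whose largest position is pos_max_rm, everything up to pos_max_rm
    // must be purged so that the remaining cached positions stay contiguous
    #include <cstdio>

    int main() {
        int pos_min = 10, pos_max = 20; // positions currently cached for sequence s
        int pos_max_rm = 15;            // largest position overwritten by the ubatch
        if (pos_min <= pos_max_rm) {
            std::printf("purge positions [%d, %d]; remaining range [%d, %d] stays contiguous\n",
                        pos_min, pos_max_rm, pos_max_rm + 1, pos_max);
        }
        return 0;
    }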
@@ -158,6 +158,8 @@ private:
// SWA
const uint32_t n_swa = 0;

int debug = 0;

const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

std::vector<ggml_context_ptr> ctxs;
@@ -42,6 +42,34 @@ function(llama_test target)
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()

function(llama_test_cmd target)
include(CMakeParseArguments)
set(options)
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
set(multiValueArgs ARGS)
cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

if (NOT DEFINED LLAMA_TEST_LABEL)
set(LLAMA_TEST_LABEL "main")
endif()
if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
set(LLAMA_TEST_WORKING_DIRECTORY .)
endif()
if (DEFINED LLAMA_TEST_NAME)
set(TEST_NAME ${LLAMA_TEST_NAME})
else()
set(TEST_NAME ${target})
endif()

add_test(
NAME ${TEST_NAME}
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
COMMAND ${target}
${LLAMA_TEST_ARGS})

set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()
# Builds and runs a test source file.
|
||||
# Optional args:
|
||||
# - NAME: name of the executable & test target (defaults to the source file name without extension)
|
||||
@@ -83,25 +111,31 @@ endfunction()
|
||||
# build test-tokenizer-0 target once and add many tests
|
||||
llama_build(test-tokenizer-0.cpp)
|
||||
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-bert-bge.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-command-r.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-coder.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-deepseek-llm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-phi-3.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-qwen2.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
|
||||
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
|
||||
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
|
||||
if (NOT WIN32)
|
||||
llama_test_cmd(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh
|
||||
NAME test-tokenizers-ggml-vocabs
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
|
||||
ARGS https://huggingface.co/ggml-org/vocabs ${PROJECT_SOURCE_DIR}/models/ggml-vocabs
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
|
||||
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf)
|
||||
endif ()
|
||||
|
||||
if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
|
||||
@@ -113,8 +147,8 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
|
||||
llama_build_and_test(test-chat.cpp)
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
|
||||
llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
|
||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${PROJECT_SOURCE_DIR}/tools/server)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
@@ -127,20 +161,20 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
|
||||
llama_build(test-tokenizer-1-bpe.cpp)
|
||||
|
||||
# TODO: disabled due to slowness
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-aquila.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-falcon.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-2.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-gpt-neox.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-bpe.gguf --ignore-merges)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-mpt.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-refact.gguf)
|
||||
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-starcoder.gguf)
|
||||
|
||||
# build test-tokenizer-1-spm target once and add many tests
|
||||
llama_build(test-tokenizer-1-spm.cpp)
|
||||
|
||||
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
|
||||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-llama-spm.gguf)
|
||||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
|
||||
|
||||
# llama_build_and_test(test-double-float.cpp) # SLOW
|
||||
endif()
|
||||
|
||||
36 tests/test-tokenizers-repo.sh Executable file
@@ -0,0 +1,36 @@
#!/bin/bash

if [ $# -lt 2 ]; then
printf "Usage: $0 <git-repo> <target-folder> [<test-exe>]\n"
exit 1
fi

if [ $# -eq 3 ]; then
toktest=$3
else
toktest="./test-tokenizer-0"
fi

if [ ! -x $toktest ]; then
printf "Test executable \"$toktest\" not found!\n"
exit 1
fi

repo=$1
folder=$2

if [ -d $folder ] && [ -d $folder/.git ]; then
(cd $folder; git pull)
else
git clone $repo $folder
fi

shopt -s globstar
for gguf in $folder/**/*.gguf; do
if [ -f $gguf.inp ] && [ -f $gguf.out ]; then
$toktest $gguf
else
printf "Found \"$gguf\" without matching inp/out files, ignoring...\n"
fi
done
Binary file not shown.
@@ -233,6 +233,7 @@ struct server_task {
slot_params defaults;
defaults.sampling = params_base.sampling;
defaults.speculative = params_base.speculative;
defaults.n_keep = params_base.n_keep;

// enabling this will output extra debug information in the HTTP responses from the server
params.verbose = params_base.verbosity > 9;
@@ -2060,6 +2061,7 @@ struct server_context {
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

slot.params.sampling = params_base.sampling;
slot.params.n_keep = params_base.n_keep;

slot.callback_on_release = [this](int) {
queue_tasks.pop_deferred_task();
@@ -41,6 +41,10 @@ html {
max-width: 900px;
}

.chat-bubble {
@apply break-words;
}

.chat-bubble-base-300 {
--tw-bg-opacity: 1;
--tw-text-opacity: 1;