Compare commits

..

1 Commits

Author SHA1 Message Date
Georgi Gerganov
956bb14595 examples : remove --instruct remnants 2024-06-10 08:37:47 +03:00
34 changed files with 1536 additions and 2607 deletions

View File

@@ -1,7 +1,15 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git

View File

@@ -1,7 +1,15 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
@@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

View File

@@ -2,4 +2,4 @@
- [ ] Review Complexity : Low
- [ ] Review Complexity : Medium
- [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- [ ] I have read the [contributing guidelines](CONTRIBUTING.md)

View File

@@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -684,7 +684,7 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-2019
runs-on: windows-latest
env:
OPENBLAS_VERSION: 0.3.23
@@ -829,7 +829,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda:
runs-on: windows-2019
runs-on: windows-latest
strategy:
matrix:
@@ -843,9 +843,8 @@ jobs:
with:
fetch-depth: 0
- name: Install CUDA toolkit
- uses: Jimver/cuda-toolkit@v0.2.11
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: ${{ matrix.cuda }}
method: 'network'

View File

@@ -16,9 +16,11 @@ on:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request:
pull_request_target:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
schedule:
- cron: '2 4 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -113,7 +115,7 @@ jobs:
server-windows:
runs-on: windows-2019
runs-on: windows-latest
steps:
- name: Clone

View File

@@ -402,26 +402,12 @@ if (LLAMA_CUBLAS)
endif()
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
set(GGML_HEADERS_CUDA ggml-cuda.h)
@@ -486,6 +472,21 @@ if (LLAMA_CUDA)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "CUDA not found")
endif()

View File

@@ -576,9 +576,7 @@ Building the program with BLAS support may lead to some performance improvements
vulkaninfo
```
Alternatively your package manager might be able to provide the appropriate libraries.
For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
Then, build llama.cpp using the cmake command below:

View File

@@ -40,7 +40,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
return result;
}
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
const std::string SPACE_RULE = "\" \"?";
struct BuiltinRule {
std::string content;
@@ -57,7 +57,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};

View File

@@ -29,8 +29,9 @@ class BuiltinRule:
self.content = content
self.deps = deps or []
# Constraining spaces to prevent model "running away".
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
@@ -42,7 +43,7 @@ PRIMITIVE_RULES = {
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}

View File

@@ -1033,27 +1033,6 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return 3;
}
if (field == "n_threads") {
return 7;
}
if (field == "n_batch") {
return 7;
}
if (field == "n_ubatch") {
return 8;
}
if (field == "type_k" || field == "type_v") {
return 6;
}
if (field == "split_mode") {
return 5;
}
if (field == "flash_attn") {
return 2;
}
if (field == "use_mmap") {
return 4;
}
if (field == "test") {
return 13;
}

View File

@@ -1,5 +1,5 @@
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
const SPACE_RULE = '" "?';
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
if (minItems === 0 && maxItems === 1) {
@@ -41,7 +41,7 @@ const PRIMITIVE_RULES = {
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
null : new BuiltinRule('"null" space', []),
};

View File

@@ -147,7 +147,7 @@ struct server_slot {
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;
json prompt; // can be either a string, array of strings or array of token ids
json prompt;
// when a task is submitted, we first tokenize the prompt and store it here
std::vector<llama_token> prompt_tokens;
@@ -958,16 +958,13 @@ struct server_context {
if (!task.infill) {
const auto & prompt = data.find("prompt");
if (prompt == data.end()) {
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
return false;
}
if ((prompt->is_string()) ||
(prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
(prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
slot.prompt = *prompt;
} else {
send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
slot.prompt = *prompt;
}
if (slot.prompt.is_array() && slot.prompt.size() == 0) {
send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
return false;
}
}
@@ -1585,18 +1582,14 @@ struct server_context {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
{
const int id_slot = json_value(task.data, "id_slot", -1);
int id_slot = json_value(task.data, "id_slot", -1);
std::string prompt = json_value(task.data, "prompt", std::string());
server_slot * slot;
if (id_slot != -1) {
slot = get_slot_by_id(id_slot);
} else {
std::string prompt;
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
json_value(task.data, "prompt", std::string());
}
slot = get_available_slot(prompt);
}

View File

@@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
ggml_backend_buffer_free(*buffers[i]);
}
free(*buffers);
return false;

View File

@@ -2740,7 +2740,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
return ggml_is_contiguous(op->src[0]);
return true;
default:
return false;
}

View File

@@ -139,7 +139,6 @@
#define CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_TURING 750
#define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
@@ -327,17 +326,9 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
#endif // defined(GGML_USE_HIPBLAS)
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
static bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
@@ -347,10 +338,6 @@ static bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}
static bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
@@ -392,7 +379,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
}
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
@@ -425,7 +412,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
}
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
return __float2half(fmaxf(__half2float(a), __half2float(b)));

View File

@@ -74,7 +74,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
const int sumi = __dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
@@ -122,7 +122,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
const int sumi = __dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
@@ -181,7 +181,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
const int sumi = __dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
@@ -236,7 +236,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
const int sumi = __dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
@@ -314,7 +314,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_h2 = (const half2 *) Q_v;
@@ -407,7 +407,7 @@ static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__
const int q0 = x[ib].qs[iqs];
const int q = ((q0 >> (4*shift)) & 0x0F) - 8;
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return ((half) d)*((half) q);
}
@@ -428,7 +428,7 @@ static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__
const int q0 = x[ib].qs[iqs];
const int q = ((q0 >> (4*shift)) & 0x0F);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return __low2half(dm)*((half) q) + __high2half(dm);
}
@@ -453,7 +453,7 @@ static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__
const int qh = ((qh0 >> idq) << 4) & 0x10;
const int q = (ql | qh) - 16;
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return ((half) d)*((half) q);
}
@@ -478,7 +478,7 @@ static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__
const int qh = ((qh0 >> idq) << 4) & 0x10;
const int q = (ql | qh);
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return __low2half(dm)*((half) q) + __high2half(dm);
}
@@ -497,7 +497,7 @@ static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__
const T d = x[ib].d;
const int q = x[ib].qs[iqs];
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
if (std::is_same<T, half>::value) {
return ((half) d)*((half) q);
}

View File

@@ -43,7 +43,7 @@ static __global__ void flash_attn_tile_ext_f16(
const int ne1,
const int ne2,
const int ne3) {
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.

View File

@@ -40,7 +40,7 @@ static __global__ void flash_attn_vec_ext_f16(
const int ne1,
const int ne2,
const int ne3) {
#ifdef FP16_AVAILABLE
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);

View File

@@ -1,9 +1,9 @@
#include "common.cuh"
#include "fattn-common.cuh"
#ifdef FP16_MMA_AVAILABLE
#if FP16_MMA_AVAILABLE
#include <mma.h>
#endif // FP16_MMA_AVAILABLE
#endif
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
@@ -45,7 +45,7 @@ static __global__ void flash_attn_ext_f16(
const int ne1,
const int ne2,
const int ne3) {
#ifdef FP16_MMA_AVAILABLE
#if FP16_MMA_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.

View File

@@ -1,161 +0,0 @@
#include "common.cuh"
struct mma_int_A_I16K4 {
static constexpr int I = 16;
static constexpr int K = 4;
static constexpr int ne = 2;
int x[ne] = {0};
static __device__ __forceinline__ int get_i(const int l) {
const int ret = (l%2) * (I/2) + threadIdx.x / K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < I);
return ret;
}
static __device__ __forceinline__ int get_k(const int /* l */) {
const int ret = threadIdx.x % K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < K);
return ret;
}
};
struct mma_int_A_I16K8 {
static constexpr int I = 16;
static constexpr int K = 8;
static constexpr int ne = 4;
int x[ne] = {0};
static __device__ __forceinline__ int get_i(const int l) {
const int ret = (l%2) * (I/2) + threadIdx.x / (K/2);
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < I);
return ret;
}
static __device__ __forceinline__ int get_k(const int l) {
const int ret = (l/2) * (K/2) + threadIdx.x % (K/2);
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < K);
return ret;
}
};
struct mma_int_B_J8K4 {
static constexpr int J = 8;
static constexpr int K = 4;
static constexpr int ne = 1;
int x[ne] = {0};
static __device__ __forceinline__ int get_j(const int /* l */) {
const int ret = threadIdx.x / K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < J);
return ret;
}
static __device__ __forceinline__ int get_k(const int /* l */) {
const int ret = threadIdx.x % K;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < K);
return ret;
}
};
struct mma_int_B_J8K8 {
static constexpr int J = 8;
static constexpr int K = 8;
static constexpr int ne = 2;
int x[ne] = {0};
static __device__ __forceinline__ int get_j(const int /* l */) {
const int ret = threadIdx.x / (K/2);
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < J);
return ret;
}
static __device__ __forceinline__ int get_k(const int l) {
const int ret = l * (K/2) + threadIdx.x % (K/2);
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < K);
return ret;
}
};
struct mma_int_C_I16J8 {
static constexpr int I = 16;
static constexpr int J = 8;
static constexpr int ne = 4;
int x[ne] = {0};
static __device__ __forceinline__ int get_i(const int l) {
const int ret = (l/2) * (I/2) + threadIdx.x / (J/2);
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < I);
return ret;
}
static __device__ __forceinline__ int get_j(const int l) {
const int ret = 2 * (threadIdx.x % (J/2)) + l%2;
GGML_CUDA_ASSUME(ret >= 0);
GGML_CUDA_ASSUME(ret < J);
return ret;
}
__device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) {
#ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE
asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#else
// On Turing m16n8k16 mma is not available, use 2x m8n8k16 mma instead:
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[0]), "+r"(x[1])
: "r"(mma_A.x[0]), "r"(mma_B.x[0]));
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
#endif // __CUDA_ARCH__ >= CC_AMPERE
#else
GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B);
NO_DEVICE_CODE;
#endif // INT8_MMA_AVAILABLE
}
__device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) {
#ifdef INT8_MMA_AVAILABLE
#if __CUDA_ARCH__ >= CC_AMPERE
asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1]));
#else
// On Turing m16n8k32 mma is not available, use 4x m8n8k16 mma instead:
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[0]), "+r"(x[1])
: "r"(mma_A.x[0]), "r"(mma_B.x[0]));
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[1]), "r"(mma_B.x[0]));
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[0]), "+r"(x[1])
: "r"(mma_A.x[2]), "r"(mma_B.x[1]));
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};"
: "+r"(x[2]), "+r"(x[3])
: "r"(mma_A.x[3]), "r"(mma_B.x[1]));
#endif // __CUDA_ARCH__ >= CC_AMPERE
#else
GGML_UNUSED(mma_A);
GGML_UNUSED(mma_B);
NO_DEVICE_CODE;
#endif // INT8_MMA_AVAILABLE
}
};

View File

@@ -2,7 +2,6 @@
#include "common.cuh"
#include "vecdotq.cuh"
#include "mma.cuh"
#include <climits>
#include <cstdint>
@@ -15,7 +14,6 @@ typedef void (*load_tiles_mmq_t)(
typedef void (*vec_dot_mmq_t)(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0);
typedef void (*mmq_write_back_t)(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1);
struct block_q8_1_mmq {
half2 ds[4];
@@ -143,15 +141,15 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
const float * x_df = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const float * x_dmf = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
@@ -172,76 +170,12 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_dp4a(
}
sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
(&x_ql[i*(WARP_SIZE + 1) + k0], u, x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
(&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0],
y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]);
}
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const float * x_df = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
mma_A A;
float dA[mma_C::ne/2];
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = k0 + mma_A::get_k(l) % QI4_0;
const int shift = 4*(mma_A::get_k(l) / QI4_0);
A.x[l] = __vsubss4((x_ql[i*(WARP_SIZE + 1) + k] >> shift) & 0x0F0F0F0F, 0x08080808);
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dA[l] = x_df[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0];
}
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A, B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/B.J)*C.ne + l] += dA[l/2]*__low2float(dsB[l%2])*C.x[l];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -281,7 +215,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -315,70 +249,6 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
mma_A A;
half2 dmA[mma_C::ne/2];
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = k0 + mma_A::get_k(l) % QI4_0;
const int shift = 4*(mma_A::get_k(l) / QI4_0);
A.x[l] = (x_ql[i*(WARP_SIZE + 1) + k] >> shift) & 0x0F0F0F0F;
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dmA[l] = x_dm[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0];
}
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A, B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
const half2 dmA_dsB = dmA[l/2]*dsB[l%2];
sum[(j0/B.J)*C.ne + l] += __low2float(dmA_dsB)*C.x[l] + __high2float(dmA_dsB);
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -438,7 +308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_0_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -473,68 +343,6 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const float * x_df = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
mma_A A;
float dA[mma_C::ne/2];
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = 2*(k0 + mma_A::get_k(l) % QI5_0) + mma_A::get_k(l) / QI5_0;
A.x[l] = x_ql[i*(2*WARP_SIZE + 1) + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dA[l] = x_df[i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0];
}
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
mma_C C;
mma_B B;
float dB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dB[l] = y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A, B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/B.J)*C.ne + l] += dA[l/2]*dB[l%2]*C.x[l];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
@@ -592,7 +400,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_1_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -626,69 +434,6 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
mma_A A;
half2 dmA[mma_C::ne/2];
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = 2*(k0 + mma_A::get_k(l) % QI5_1) + mma_A::get_k(l) / QI5_1;
A.x[l] = x_ql[i*(2*WARP_SIZE + 1) + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dmA[l] = x_dm[i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1];
}
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A, B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
const half2 dmA_dsB = dmA[l/2]*dsB[l%2];
sum[(j0/B.J)*C.ne + l] += __low2float(dmA_dsB)*C.x[l] + __high2float(dmA_dsB);
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -730,7 +475,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -755,69 +500,6 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const float * x_df = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
mma_A A;
float dA[mma_C::ne/2];
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = k0 + mma_A::get_k(l);
A.x[l] = x_ql[i*(WARP_SIZE + 1) + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dA[l] = x_df[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0];
}
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
mma_C C;
mma_B B;
float dB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = k0 + mma_B::get_k(l);
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dB[l] = y_df[j*MMQ_TILE_Y_K + k0/QI8_1];
}
C.mma_K8(A, B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/B.J)*C.ne + l] += C.x[l]*dA[l/2]*dB[l%2];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -1089,7 +771,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -1115,97 +797,6 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
mma_A A[2];
int scA[mma_C::ne/2][2];
int mA[mma_C::ne/2][2];
half2 dmA[mma_C::ne/2];
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q4_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = k0 + mma_A::get_k(l);
A[kvdr/4].x[l] = (x_ql[i*(WARP_SIZE + 1) + k] >> kvdr) & 0x0F0F0F0F;
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
const uint8_t * m = sc + 8;
scA[l][kvdr/4] = sc[kvdr/4];
mA[l][kvdr/4] = m[kvdr/4];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dmA[l] = x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K + k0/QI5_K];
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
float tmpd[mma_C::ne] = {0.0f};
float tmpm[mma_C::ne] = {0.0f};
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A[kvdr/4], B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
tmpd[l] += (C.x[l]*scA[l/2][kvdr/4]) * __low2float(dsB[l%2]);
tmpm[l] += mA[l/2][kvdr/4] * __high2float(dsB[l%2]);
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/mma_B::J)*mma_C::ne + l] += __low2float(dmA[l/2])*tmpd[l] - __high2float(dmA[l/2])*tmpm[l];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -1279,7 +870,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -1305,97 +896,6 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K8 mma_A;
typedef mma_int_B_J8K8 mma_B;
typedef mma_int_C_I16J8 mma_C;
const int * y_qs = (const int *) y + 4;
const half2 * y_ds = (const half2 *) y;
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
mma_A A[2];
int scA[mma_C::ne/2][2];
int mA[mma_C::ne/2][2];
half2 dmA[mma_C::ne/2];
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = QR5_K*k0 + QR5_K*kvdr + mma_A::get_k(l);
A[kvdr/4].x[l] = x_ql[i*(QR5_K*WARP_SIZE + 1) + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8);
const uint8_t * m = sc + 8;
scA[l][kvdr/4] = sc[kvdr/4];
mA[l][kvdr/4] = m[kvdr/4];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dmA[l] = x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K + k0/QI5_K];
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
float tmpd[mma_C::ne] = {0.0f};
float tmpm[mma_C::ne] = {0.0f};
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q5_K_Q8_1_MMQ; kvdr += 4) {
mma_C C;
mma_B B;
half2 dsB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;
B.x[l] = y_qs[j*MMQ_TILE_Y_K + k];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dsB[l] = y_ds[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
}
C.mma_K8(A[kvdr/4], B);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
tmpd[l] += (C.x[l]*scA[l/2][kvdr/4]) * __low2float(dsB[l%2]);
tmpm[l] += mA[l/2][kvdr/4] * __high2float(dsB[l%2]);
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/mma_B::J)*mma_C::ne + l] += __low2float(dmA[l/2])*tmpd[l] - __high2float(dmA[l/2])*tmpm[l];
}
}
}
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) {
@@ -1462,7 +962,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
@@ -1489,148 +989,6 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
}
}
template <int mmq_x, int mmq_y, int nwarps>
static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
const int * __restrict__ y, float * __restrict__ sum, const int & k0) {
GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
typedef mma_int_A_I16K4 mma_A;
typedef mma_int_B_J8K4 mma_B;
typedef mma_int_C_I16J8 mma_C;
const float * x_df = (const float *) x_dm;
const int * y_qs = (const int *) y + 4;
const float * y_df = (const float *) y;
const int i0 = threadIdx.y*mma_A::I;
static_assert(nwarps*mma_A::I == mmq_y, "nwarps*mma_A::I != mmq_y");
mma_A A[4];
int scA[mma_C::ne/2][4];
float dA[mma_C::ne/2];
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q6_K_Q8_1_MMQ; kvdr += 4) {
#pragma unroll
for (int l = 0; l < mma_A::ne; ++l) {
const int i = i0 + mma_A::get_i(l);
const int k = QR6_K*k0 + QR6_K*kvdr + mma_A::get_k(l);
A[kvdr/2 + 0].x[l] = x_ql[i*(QR6_K*WARP_SIZE + 1) + k + 0];
A[kvdr/2 + 1].x[l] = x_ql[i*(QR6_K*WARP_SIZE + 1) + k + mma_A::K];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]);
scA[l][kvdr/2 + 0] = sc[kvdr/2 + 0];
scA[l][kvdr/2 + 1] = sc[kvdr/2 + 1];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int i = i0 + mma_C::get_i(2*l);
dA[l] = x_df[i*(WARP_SIZE/QI6_K) + i/QI6_K + k0/QI6_K];
}
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_int_B_J8K8::J) {
float tmp[mma_C::ne] = {0.0f};
#pragma unroll
for (int kvdr = 0; kvdr < VDR_Q6_K_Q8_1_MMQ; kvdr += 4) {
mma_C C[2];
mma_B B[2];
float dB[mma_C::ne/2];
#pragma unroll
for (int l = 0; l < mma_B::ne; ++l) {
const int j = j0 + mma_B::get_j(l);
const int k = (2*k0 + 2*kvdr + mma_B::get_k(l)) % WARP_SIZE;
B[0].x[l] = y_qs[j*MMQ_TILE_Y_K + k + 0];
B[1].x[l] = y_qs[j*MMQ_TILE_Y_K + k + mma_B::K];
}
#pragma unroll
for (int l = 0; l < mma_C::ne/2; ++l) {
const int j = j0 + mma_C::get_j(l);
dB[l] = y_df[j*MMQ_TILE_Y_K + ((2*k0 + 2*kvdr)/QI8_1) % (WARP_SIZE/QI8_1)];
}
C[0].mma_K4(A[kvdr/2 + 0], B[0]);
C[1].mma_K4(A[kvdr/2 + 1], B[1]);
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
tmp[l] += (C[0].x[l]*scA[l/2][kvdr/2 + 0] + C[1].x[l]*scA[l/2][kvdr/2 + 1])*dB[l%2];
}
}
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
sum[(j0/mma_B::J)*mma_C::ne + l] += tmp[l]*dA[l/2];
}
}
}
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_dp4a(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = blockIdx.y*mmq_x + j0 + threadIdx.y;
if (j >= ne1) {
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = blockIdx.x*mmq_y + i0 + threadIdx.x;
if (need_check && i >= ne0) {
continue;
}
dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
}
}
}
template<int mmq_x, int mmq_y, int nwarps, bool need_check>
static __device__ __forceinline__ void mmq_write_back_mma(const float * __restrict__ sum, float * __restrict__ dst, const int & ne0, const int & ne1) {
typedef mma_int_C_I16J8 mma_C;
const int i0 = threadIdx.y*mma_C::I;
static_assert(nwarps*mma_C::I == mmq_y, "nwarps*mma_C::I != mmq_y");
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += mma_C::J) {
#pragma unroll
for (int l = 0; l < mma_C::ne; ++l) {
const int j = blockIdx.y*mmq_x + j0 + mma_C::get_j(l);
if (j >= ne1) {
continue;
}
const int i = blockIdx.x*mmq_y + i0 + mma_C::get_i(l);
if (need_check && i >= ne0) {
continue;
}
dst[j*ne0 + i] = sum[(j0/mma_C::J)*mma_C::ne + l];
}
}
}
// -------------------------------------------------------------------------------------------------------------------------------------
template <int mmq_x, int mmq_y, int nwarps, bool need_check, ggml_type type>
@@ -1640,65 +998,35 @@ template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_0> {
static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_1> {
static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_0> {
static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_1> {
static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q8_0> {
static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
@@ -1706,7 +1034,6 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q2_K> {
static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
@@ -1714,46 +1041,27 @@ struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q3_K> {
static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K<mmq_y, nwarps, need_check>;
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q4_K> {
static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q5_K> {
static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
template <int mmq_x, int mmq_y, int nwarps, bool need_check>
struct mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, GGML_TYPE_Q6_K> {
static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ;
static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K<mmq_y, nwarps, need_check>;
#ifdef INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mma<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_mma<mmq_x, mmq_y, nwarps, need_check>;
#else
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_dp4a<mmq_x, mmq_y, nwarps>;
static constexpr mmq_write_back_t write_back = mmq_write_back_dp4a<mmq_x, mmq_y, nwarps, need_check>;
#endif // INT8_MMA_AVAILABLE
static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat<mmq_x, mmq_y, nwarps>;
};
static int mmq_need_sum(const ggml_type type_x) {
@@ -1810,7 +1118,6 @@ static __global__ void mul_mat_q(
constexpr int vdr = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vdr;
constexpr load_tiles_mmq_t load_tiles = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::load_tiles;
constexpr vec_dot_mmq_t vec_dot = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::vec_dot;
constexpr mmq_write_back_t write_back = mmq_type_traits<mmq_x, mmq_y, nwarps, need_check, type>::write_back;
constexpr tile_x_sizes txs = get_tile_x_sizes_device<mmq_y>(type);
@@ -1830,7 +1137,7 @@ static __global__ void mul_mat_q(
const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int));
float sum[mmq_x*mmq_y / (nwarps*WARP_SIZE)] = {0.0f};
float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f};
for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) {
@@ -1857,7 +1164,25 @@ static __global__ void mul_mat_q(
}
}
write_back(sum, dst, ne0, ne1);
#pragma unroll
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
const int j = blockIdx.y*mmq_x + j0 + threadIdx.y;
if (j >= ne1) {
return;
}
#pragma unroll
for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
const int i = blockIdx.x*mmq_y + i0 + threadIdx.x;
if (need_check && i >= ne0) {
continue;
}
dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE];
}
}
}
struct mmq_args {
@@ -1931,10 +1256,10 @@ void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) {
launch_mul_mat_q<type, 8, 4>(args, stream);
break;
case 16:
launch_mul_mat_q<type, 16, 4>(args, stream);
launch_mul_mat_q<type, 16, 8>(args, stream);
break;
case 24:
launch_mul_mat_q<type, 24, 4>(args, stream);
launch_mul_mat_q<type, 24, 8>(args, stream);
break;
case 32:
launch_mul_mat_q<type, 32, 8>(args, stream);

View File

@@ -148,8 +148,6 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -162,8 +160,6 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -176,8 +172,6 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -190,8 +184,6 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -204,8 +196,6 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -218,8 +208,6 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -232,8 +220,6 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -246,8 +232,6 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -260,8 +244,6 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -277,8 +259,6 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

View File

@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
return ggml_is_contiguous(op->src[0]);
return true;
default:
;
}

View File

@@ -744,7 +744,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
return ggml_is_contiguous(op->src[0]);
return true;
default:
return false;
}

View File

@@ -13089,12 +13089,10 @@ void *ggml_sycl_host_malloc(size_t size) try {
return nullptr;
}
ggml_sycl_set_device(g_main_device);
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
void * ptr = nullptr;
//allow to use dpct::get_in_order_queue() for host malloc
dpct::err0 err = CHECK_TRY_ERROR(
ptr = (void *)sycl::malloc_host(size, *main_stream));
ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
if (err != 0) {
// clear the error
@@ -13115,9 +13113,8 @@ catch (sycl::exception const &exc) {
}
void ggml_sycl_host_free(void *ptr) try {
ggml_sycl_set_device(g_main_device);
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *main_stream)));
//allow to use dpct::get_in_order_queue() for host malloc
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -17190,7 +17187,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
return ggml_is_contiguous(op->src[0]);
return true;
default:
return false;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#ifdef GGML_VULKAN_RUN_TESTS
#include <chrono>
#endif
@@ -9,13 +9,12 @@
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <tuple>
#include <vector>
#include <sstream>
#include <utility>
#include <memory>
#include <limits>
#include <map>
#include "ggml.h"
#include "ggml-backend-impl.h"
@@ -151,7 +150,7 @@ struct vk_device {
vk_pipeline pipeline_relu_f32;
vk_pipeline pipeline_diag_mask_inf_f32;
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
vk_pipeline pipeline_argsort_f32;
vk_pipeline pipeline_sum_rows_f32;
@@ -284,15 +283,26 @@ struct vk_op_diag_mask_push_constants {
struct vk_op_rope_push_constants {
uint32_t ncols;
uint32_t n_dims;
float freq_scale;
uint32_t p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float corr_dims[4];
};
struct vk_op_rope_neox_push_constants {
uint32_t ncols;
uint32_t ndims;
float freq_scale;
uint32_t p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[4];
float theta_scale;
uint32_t has_ff;
float inv_ndims;
uint32_t has_freq_facs;
};
struct vk_op_soft_max_push_constants {
@@ -1524,11 +1534,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
@@ -1556,10 +1566,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
vk::PhysicalDeviceProperties2 props2;
vk::PhysicalDeviceMaintenance3Properties props3;
vk::PhysicalDeviceSubgroupProperties subgroup_props;
vk::PhysicalDeviceDriverProperties driver_props;
props2.pNext = &props3;
props3.pNext = &subgroup_props;
subgroup_props.pNext = &driver_props;
physical_device.getProperties2(&props2);
const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1603,7 +1611,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
fp16 = fp16 && vk12_features.shaderFloat16;
std::string device_name = props2.properties.deviceName.data();
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
@@ -1699,78 +1707,7 @@ void ggml_vk_instance_init() {
vk::PhysicalDeviceProperties props = devices[i].getProperties();
if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
// Check if there are two physical devices corresponding to the same GPU
auto old_device = std::find_if(
vk_instance.device_indices.begin(),
vk_instance.device_indices.end(),
[&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
);
if (old_device == vk_instance.device_indices.end()) {
vk_instance.device_indices.push_back(i);
} else {
// There can be two physical devices corresponding to the same GPU if there are 2 different drivers
// This can cause error when splitting layers aross the devices, need to keep only 1
#ifdef GGML_VULKAN_DEBUG
std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
#endif
vk::PhysicalDeviceProperties2 old_prop;
vk::PhysicalDeviceDriverProperties old_driver;
old_prop.pNext = &old_driver;
devices[*old_device].getProperties2(&old_prop);
vk::PhysicalDeviceProperties2 new_prop;
vk::PhysicalDeviceDriverProperties new_driver;
new_prop.pNext = &new_driver;
devices[i].getProperties2(&new_prop);
std::map<vk::DriverId, int> driver_priorities {};
int old_priority = std::numeric_limits<int>::max();
int new_priority = std::numeric_limits<int>::max();
// Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
// Smaller number -> higher priority
switch (old_prop.properties.vendorID) {
case VK_VENDOR_ID_AMD:
driver_priorities[vk::DriverId::eMesaRadv] = 1;
driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
driver_priorities[vk::DriverId::eAmdProprietary] = 3;
break;
case VK_VENDOR_ID_INTEL:
driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
break;
case VK_VENDOR_ID_NVIDIA:
driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
driver_priorities[vk::DriverId::eMesaNvk] = 2;
#endif
break;
}
if (driver_priorities.count(old_driver.driverID)) {
old_priority = driver_priorities[old_driver.driverID];
}
if (driver_priorities.count(new_driver.driverID)) {
new_priority = driver_priorities[new_driver.driverID];
}
if (new_priority < old_priority) {
auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
vk_instance.device_indices.push_back(i);
#ifdef GGML_VULKAN_DEBUG
std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
#endif
}
#ifdef GGML_VULKAN_DEBUG
else {
std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
}
#endif
}
vk_instance.device_indices.push_back(i);
}
}
@@ -3968,10 +3905,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
}
} else {
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_rope_norm_f32;
return ctx->device->pipeline_rope_f32;
}
if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_rope_norm_f16;
return ctx->device->pipeline_rope_f16;
}
}
return nullptr;
@@ -4215,16 +4152,24 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else if (op == GGML_OP_ROPE) {
// Empty src2 is possible in rope, but the shader needs a buffer
vk_subbuffer subbuf_z;
if (use_src2) {
subbuf_z = { d_Z, z_buf_offset, z_sz };
} else {
subbuf_z = { d_X, 0, d_X->size };
}
const int mode = ((int32_t *) dst->op_params)[2];
const bool is_neox = mode & 2;
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
if (is_neox) {
// Empty src2 is possible in rope, but the shader needs a buffer
vk_subbuffer subbuf_z;
if (use_src2) {
subbuf_z = { d_Z, z_buf_offset, z_sz };
} else {
subbuf_z = { d_X, 0, d_X->size };
}
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
} else {
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
}
} else if (use_src2) {
ggml_vk_sync_buffers(subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4446,7 +4391,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
const int n_dims = ((int32_t *) dst->op_params)[1];
// const int mode = ((int32_t *) dst->op_params)[2];
const int mode = ((int32_t *) dst->op_params)[2];
// const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
const float freq_base = ((float *) dst->op_params)[5];
@@ -4456,16 +4401,28 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
const float beta_fast = ((float *) dst->op_params)[9];
const float beta_slow = ((float *) dst->op_params)[10];
const bool is_neox = mode & 2;
#pragma message("TODO: update rope NORM mode to match NEOX mode")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const float theta_scale = powf(freq_base, -2.0f/n_dims);
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
src2 != nullptr,
});
if (is_neox) {
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float inv_ndims = -1.0f / n_dims;
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
src2 != nullptr,
});
} else {
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
});
}
}
static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -6113,13 +6070,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
#endif
ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
vk_buffer dev_buffer = nullptr;
try {
dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
} catch (const vk::SystemError& e) {
return nullptr;
}
vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
@@ -6439,7 +6390,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
return ggml_is_contiguous(op->src[0]);
return true;
default:
return false;
}
@@ -6515,7 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
// return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
// } break;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
return true;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:

172
ggml.c
View File

@@ -3212,42 +3212,35 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
size_t next_nb = ggml_type_size(tensor->type);
if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
return false;
}
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
if (tensor->ne[i] != 1) {
if (i > n) {
if (tensor->nb[i] != next_nb) {
return false;
}
next_nb *= tensor->ne[i];
} else {
// this dimension does not need to be contiguous
next_nb = tensor->ne[i]*tensor->nb[i];
}
}
}
return true;
}
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
return ggml_is_contiguous_0(tensor);
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
return ggml_is_contiguous_n(tensor, 0);
return ggml_is_contiguous(tensor);
}
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
return ggml_is_contiguous_n(tensor, 1);
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
return ggml_is_contiguous_n(tensor, 2);
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
tensor->nb[0] == ggml_type_size(tensor->type) &&
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
@@ -3279,20 +3272,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->ne[0] == t1->ne[0]) &&
(t0->ne[1] == t1->ne[1]) &&
(t0->ne[2] == t1->ne[2]) &&
(t0->ne[3] == t1->ne[3]);
(t0->ne[0] == t1->ne[0] ) &&
(t0->ne[1] == t1->ne[1] ) &&
(t0->ne[2] == t1->ne[2] ) &&
(t0->ne[3] == t1->ne[3] );
}
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
return
(t0->nb[0] == t1->nb[0]) &&
(t0->nb[1] == t1->nb[1]) &&
(t0->nb[2] == t1->nb[2]) &&
(t0->nb[3] == t1->nb[3]);
(t0->nb[0] == t1->nb[0] ) &&
(t0->nb[1] == t1->nb[1] ) &&
(t0->nb[2] == t1->nb[2] ) &&
(t0->nb[3] == t1->nb[3] );
}
// check if t1 can be represented as a repeatition of t0
@@ -4085,26 +4078,32 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
return ((int8_t *)(tensor->data))[i];
}
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
return ((int16_t *)(tensor->data))[i];
}
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
return ((int32_t *)(tensor->data))[i];
}
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
}
case GGML_TYPE_BF16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
}
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return ((float *)(tensor->data))[i];
}
default:
@@ -4126,26 +4125,32 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
switch (tensor->type) {
case GGML_TYPE_I8:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
((int8_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
((int16_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_I32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
((int32_t *)(tensor->data))[i] = value;
} break;
case GGML_TYPE_F16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
} break;
case GGML_TYPE_BF16:
{
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(tensor->nb[0] == sizeof(float));
((float *)(tensor->data))[i] = value;
} break;
default:
@@ -7338,15 +7343,13 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
}
// ggml_unary
// gmml_unary
static struct ggml_tensor * ggml_unary_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op,
bool inplace) {
GGML_ASSERT(ggml_is_contiguous_1(a));
bool is_node = false;
if (!inplace && (a->grad)) {
@@ -11011,8 +11014,6 @@ static void ggml_compute_forward_abs_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11022,6 +11023,9 @@ static void ggml_compute_forward_abs_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_abs_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11056,8 +11060,6 @@ static void ggml_compute_forward_sgn_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11067,6 +11069,9 @@ static void ggml_compute_forward_sgn_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sgn_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11101,8 +11106,6 @@ static void ggml_compute_forward_neg_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11112,6 +11115,9 @@ static void ggml_compute_forward_neg_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_neg_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11146,8 +11152,6 @@ static void ggml_compute_forward_step_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11157,6 +11161,9 @@ static void ggml_compute_forward_step_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_step_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11191,8 +11198,6 @@ static void ggml_compute_forward_tanh_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11202,6 +11207,9 @@ static void ggml_compute_forward_tanh_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_tanh_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11236,8 +11244,6 @@ static void ggml_compute_forward_elu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11247,6 +11253,9 @@ static void ggml_compute_forward_elu_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_elu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11281,8 +11290,6 @@ static void ggml_compute_forward_relu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11292,6 +11299,9 @@ static void ggml_compute_forward_relu_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_relu_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11326,8 +11336,6 @@ static void ggml_compute_forward_sigmoid_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11337,6 +11345,9 @@ static void ggml_compute_forward_sigmoid_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_sigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11370,9 +11381,9 @@ static void ggml_compute_forward_gelu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11433,9 +11444,9 @@ static void ggml_compute_forward_gelu_quick_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11496,9 +11507,9 @@ static void ggml_compute_forward_silu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11559,8 +11570,6 @@ static void ggml_compute_forward_leaky_relu_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11610,11 +11619,11 @@ static void ggml_compute_forward_silu_back_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * grad = dst->src[1];
assert(ggml_is_contiguous_1(grad));
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
assert(ggml_are_same_shape(src0, grad));
GGML_ASSERT(ggml_is_contiguous_1(grad));
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, grad));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -11676,8 +11685,6 @@ static void ggml_compute_forward_hardswish_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11687,6 +11694,9 @@ static void ggml_compute_forward_hardswish_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardswish_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11718,8 +11728,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11729,6 +11737,9 @@ static void ggml_compute_forward_hardsigmoid_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert(dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
ggml_vec_hardsigmoid_f32(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16675,10 +16686,7 @@ static void ggml_compute_forward_map_unary_f32(
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
@@ -16687,6 +16695,9 @@ static void ggml_compute_forward_map_unary_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
fun(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16724,9 +16735,6 @@ static void ggml_compute_forward_map_binary_f32(
const struct ggml_tensor * src1 = dst->src[1];
assert(params->ith == 0);
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(src1));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16736,6 +16744,10 @@ static void ggml_compute_forward_map_binary_f32(
const int n = ggml_nrows(src0);
const int nc = src0->ne[0];
assert( dst->nb[0] == sizeof(float));
assert(src0->nb[0] == sizeof(float));
assert(src1->nb[0] == sizeof(float));
for (int i = 0; i < n; i++) {
fun(nc,
(float *) ((char *) dst->data + i*( dst->nb[1])),

View File

@@ -2400,7 +2400,7 @@ void main() {
"""
# ROPE
rope_norm_src = """
rope_src = """
#version 450
#extension GL_EXT_shader_16bit_storage : require
@@ -2408,21 +2408,17 @@ rope_norm_src = """
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {int data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
layout (binding = 1) readonly buffer Y {int data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
uint n_dims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float theta_scale;
uint has_ff;
float corr_dims[4];
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {
@@ -2454,24 +2450,14 @@ void main() {
return;
}
if (col >= p.n_dims) {
const uint i = row*p.ncols + col;
data_d[i + 0] = data_a[i + 0];
data_d[i + 1] = data_a[i + 1];
return;
}
const uint i = row*p.ncols + col;
const uint i2 = row/p.p_delta_rows;
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
const int pos = data_b[i2];
const float theta_base = pos * pow(p.freq_base, -float(col)/p.ncols);
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
rope_yarn(theta_base, col, cos_theta, sin_theta);
const float x0 = float(data_a[i + 0]);
const float x1 = float(data_a[i + 1]);
@@ -2489,21 +2475,22 @@ rope_neox_src = """
layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {int data_pos[];};
layout (binding = 2) readonly buffer Z {float data_ff[];};
layout (binding = 1) readonly buffer Y {int data_b[];};
layout (binding = 2) readonly buffer Z {float data_freq_factors[];};
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
layout (push_constant) uniform parameter {
uint ncols;
uint n_dims;
uint ndims;
float freq_scale;
uint p_delta_rows;
float freq_base;
float ext_factor;
float attn_factor;
float corr_dims[2];
float corr_dims[4];
float theta_scale;
uint has_ff;
float inv_ndims;
uint has_freq_facs;
} p;
float rope_yarn_ramp(const float low, const float high, const uint i0) {
@@ -2535,8 +2522,11 @@ void main() {
return;
}
if (col >= p.n_dims) {
const uint i = row*p.ncols + col;
const uint ib = col / p.ndims;
const uint ic = col % p.ndims;
if (ib > 0) {
const uint i = row*p.ncols + ib*p.ndims + ic;
data_d[i + 0] = data_a[i + 0];
data_d[i + 1] = data_a[i + 1];
@@ -2544,27 +2534,29 @@ void main() {
return;
}
const uint i = row*p.ncols + col/2;
const uint i = row*p.ncols + ib*p.ndims + ic/2;
const uint i2 = row/p.p_delta_rows;
const float theta_base = data_pos[i2] * pow(p.theta_scale, col/2.0f);
const float freq_factor = p.has_ff != 0 ? data_ff[col/2] : 1.0f;
const int pos = data_b[i2];
const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
float cos_theta, sin_theta;
rope_yarn(theta_base / freq_factor, col, cos_theta, sin_theta);
rope_yarn(theta_base, ic, cos_theta, sin_theta);
const float x0 = float(data_a[i + 0]);
const float x1 = float(data_a[i + p.n_dims/2]);
const float x1 = float(data_a[i + p.ndims/2]);
data_d[i + 0] = D_TYPE(x0*cos_theta - x1*sin_theta);
data_d[i + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
data_d[i + p.ndims/2] = D_TYPE(x0*sin_theta + x1*cos_theta);
}
"""
argsort_src = """
#version 450
#extension GL_EXT_shader_16bit_storage : require
#define BLOCK_SIZE 1024
#define ASC 0
@@ -3047,8 +3039,8 @@ async def main():
tasks.append(string_to_spv("soft_max_f32", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "C_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32_f16", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float16_t", "C_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f32", rope_norm_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f16", rope_norm_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))

View File

@@ -94,8 +94,6 @@ This guide provides a brief overview. Check out the GBNF files in this directory
./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
```
`llama.cpp` can also convert JSON schemas to grammars either ahead of time or at each request, see below.
## Troubleshooting
Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
@@ -105,40 +103,3 @@ Grammars currently have performance gotchas (see https://github.com/ggerganov/ll
A common pattern is to allow repetitions of a pattern `x` up to N times.
While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) may result in extremely slow sampling. Instead, you can write `x{0,N}` (or `(x (x (x ... (x)?...)?)?)?` w/ N-deep nesting in earlier llama.cpp versions).
## Using GBNF grammars
You can use GBNF grammars:
- In the [server](../examples/server)'s completion endpoints, passed as the `grammar` body field
- In the [main](../examples/main) CLI, passed as the `--grammar` & `--grammar-file` flags
- With the [gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
## JSON Schemas → GBNF
`llama.cpp` supports converting a subset of https://json-schema.org/ to GBNF grammars:
- In the [server](../examples/server):
- For any completion endpoints, passed as the `json_schema` body field
- For the `/chat/completions` endpoint, passed inside the `result_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}`)
- In the [main](../examples/main) CLI, passed as the `--json` / `-j` flag
- To convert to a grammar ahead of time:
- in CLI, with [json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
- in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
Here is also a non-exhaustive list of **unsupported** features:
- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
- `string` formats `uri`, `email`
- [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
- `uniqueItems`
- `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
- [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
- [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)

View File

@@ -16,10 +16,10 @@ array ::=
string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
"\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [0-9] [1-9]{0,15})? ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= | " " | "\n" [ \t]{0,20}
ws ::= ([ \t\n] ws)?

View File

@@ -25,10 +25,10 @@ array ::=
string ::=
"\"" (
[^"\\\x7F\x00-\x1F] |
"\\" (["\\bfnrt] | "u" [0-9a-fA-F]{4}) # escapes
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]{0,15})) ("." [0-9]+)? ([eE] [-+]? [1-9] [0-9]{0,15})? ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= | " " | "\n" [ \t]{0,20}
ws ::= ([ \t\n] ws)?

View File

@@ -642,29 +642,20 @@ struct test_case {
struct test_unary : public test_case {
const ggml_unary_op op;
const ggml_type type;
const std::array<int64_t, 4> ne_a;
int v; // view (1 : non-contiguous a)
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR3(type, ne_a, v);
return VARS_TO_STR2(type, ne);
}
test_unary(ggml_unary_op op,
ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
int v = 0)
: op(op), type(type), ne_a(ne_a), v(v) {}
std::array<int64_t, 4> ne = {128, 10, 10, 10})
: op(op), type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a;
if (v & 1) {
auto ne = ne_a; ne[0] *= 3;
a = ggml_new_tensor(ctx, type, 4, ne.data());
a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
} else {
a = ggml_new_tensor(ctx, type, 4, ne_a.data());
}
ggml_tensor * out = ggml_unary(ctx, a, op);
ggml_tensor * in = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_unary(ctx, in, op);
return out;
}
@@ -2025,11 +2016,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
};
// unary ops
for (int v : {0, 1}) {
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
}
for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
test_cases.emplace_back(new test_unary((ggml_unary_op) op));
test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }));
}
test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));

View File

@@ -105,14 +105,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null
)"""
@@ -135,7 +135,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
date-time ::= date "T" time
date-time-string ::= "\"" date-time "\"" space
root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
time-string ::= "\"" time "\"" space
tuple-0 ::= date-string
@@ -152,9 +152,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"type": "string"
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char* "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -166,9 +166,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"minLength": 1
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char+ "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -180,9 +180,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"minLength": 3
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char{3,} "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -194,9 +194,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"maxLength": 3
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char{0,3} "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -209,9 +209,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"maxLength": 4
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "\"" char{1,4} "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -223,7 +223,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= ("true" | "false") space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -236,7 +236,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
integral-part ::= [0] | [1-9] [0-9]{0,15}
root ::= ("-"? integral-part) space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -248,7 +248,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "\"foo\""
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -260,7 +260,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "123"
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -272,7 +272,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -283,9 +283,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }]
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "[" space string "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -297,12 +297,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"prefixItems": [{ "type": "string" }, { "type": "number" }]
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "[" space string "," space number "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -317,7 +317,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -333,7 +333,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space boolean ("," space boolean)+ "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -349,7 +349,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space boolean? "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -365,7 +365,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
boolean ::= ("true" | "false") space
root ::= "[" space (boolean ("," space boolean)?)? "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -386,7 +386,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
item ::= number | integer
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "[" space item ("," space item){2,4} "]" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -399,7 +399,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -412,7 +412,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "\"" "[]{}()|+*?" "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -425,7 +425,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "\"" "\"" "\"" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -440,7 +440,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
dot ::= [^\x0A\x0D]
root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space
root-1 ::= [0-9]
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -466,9 +466,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -486,9 +486,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
a-kv ::= "\"a\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "{" space (a-kv )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -510,9 +510,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
b-kv ::= "\"b\"" space ":" space string
b-rest ::= ( "," space c-kv )?
c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -534,11 +534,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
d-kv ::= "\"d\"" space ":" space string
d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -554,12 +554,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kv ::= string ":" space additional-value
additional-kvs ::= additional-kv ( "," space additional-kv )*
additional-value ::= "[" space (number ("," space number)*)? "]" space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space (additional-kvs )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -574,14 +574,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null
)"""
@@ -596,14 +596,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
array ::= "[" space ( value ("," space value)* )? "]" space
boolean ::= ("true" | "false") space
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
null ::= "null" space
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
root ::= object
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
value ::= object | array | string | number | boolean | null
)"""
@@ -618,7 +618,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
root ::= "{" space "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -637,12 +637,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space number
additional-kv ::= string ":" space string
additional-kvs ::= additional-kv ( "," space additional-kv )*
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv ( "," space ( additional-kvs ) )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -662,12 +662,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-rest ::= additional-kvs
additional-kv ::= string ":" space number
additional-kvs ::= additional-kv ( "," space additional-kv )*
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space (a-kv a-rest | additional-kvs )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -690,12 +690,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
additional-kvs ::= additional-kv ( "," space additional-kv )*
b-kv ::= "\"b\"" space ":" space number
b-rest ::= additional-kvs
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
decimal-part ::= [0-9]{1,16}
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv ( "," space ( b-kv b-rest | additional-kvs ) )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -721,11 +721,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
}
})""",
R"""(
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
char ::= [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})
foo ::= "{" space foo-a-kv "}" space
foo-a-kv ::= "\"a\"" space ":" space string
root ::= foo
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
string ::= "\"" char* "\"" space
)"""
});
@@ -759,7 +759,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= alternative-0 | alternative-1
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -803,7 +803,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
integral-part ::= [0] | [1-9] [0-9]{0,15}
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
@@ -851,7 +851,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
number-number-kv ::= "\"number\"" space ":" space number-number
number-number-root-kv ::= "\"root\"" space ":" space number
root ::= "{" space number-kv "}" space
space ::= | " " | "\n" [ \t]{0,20}
space ::= " "?
)"""
});
}
@@ -870,7 +870,7 @@ int main() {
}
});
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
@@ -878,7 +878,7 @@ int main() {
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {