Compare commits

..

9 Commits

Author SHA1 Message Date
Johannes Gäßler
ac3b886953 llama : fix embd when offloading non-repeating layers (#1891) 2023-06-16 21:25:51 +03:00
FrankHB
5b9ccaf104 Fixed possible macro redefinition (#1892)
MinGW libstdc++ may define `NOMINMAX` unconditionally. This fixes the case when it is already defined.
2023-06-16 21:25:01 +03:00
Borislav Stanimirov
9cbf50c041 build : fix and ignore MSVC warnings (#1889) 2023-06-16 21:23:53 +03:00
Kawrakow
3d01122610 CUDA : faster k-quant dot kernels (#1862)
* cuda : faster k-quant dot kernels

* Imrove Q2_K dot kernel on older GPUs

We now have a K_QUANTS_PER_ITERATION macro, which should be
set to 1 on older and to 2 on newer GPUs.
With this, we preserve the performance of the original
PR on RTX-4080, and are faster compared to master on
GTX-1660.

* Imrove Q6_K dot kernel on older GPUs

Using the same K_QUANTS_PER_ITERATION macro as last commit,
we preserve performance on RTX-4080 and speed up
Q6_K on a GTX-1660.

* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile

Allowed values are 1 or 2. 2 gives the best performance on
modern GPUs and is set as default. On older GPUs 1 may work
better.

* PR comments

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2023-06-16 20:08:44 +03:00
Borislav Stanimirov
602c748863 gitignore : add several entries specific to Visual Studio (#1888) 2023-06-16 09:58:11 +03:00
Johannes Gäßler
a09f9195be Fixed CUDA runtime version check (#1879) 2023-06-15 21:49:08 +02:00
Georgi Gerganov
bed9275617 cmake : remove whitespaces 2023-06-15 21:56:50 +03:00
yangli2
c36e81da62 examples : add chat-vicuna.sh (#1854)
Co-authored-by: Yang Li <yangliyl@google.com>
2023-06-15 21:05:53 +03:00
Igor Okulist
3559433fec cmake : set include path for OpenBlas (#1830) 2023-06-15 20:51:26 +03:00
22 changed files with 537 additions and 266 deletions

2
.gitignore vendored
View File

@@ -22,6 +22,7 @@ build-metal/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
out/
models/*
*.bin
@@ -41,6 +42,7 @@ models/*
build-info.h
arm_neon.h
compile_commands.json
CMakeSettings.json
__pycache__

View File

@@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -163,12 +164,23 @@ if (LLAMA_BLAS)
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
)
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
include_directories(${BLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -190,6 +202,7 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -408,7 +421,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_EXTRA}
)
target_include_directories(ggml PUBLIC .)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

View File

@@ -171,6 +171,11 @@ ifdef LLAMA_CUDA_DMMV_Y
else
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
endif # LLAMA_CUDA_DMMV_Y
ifdef LLAMA_CUDA_KQUANTS_ITER
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

View File

@@ -4,6 +4,10 @@
#include <random>
#include <cstring>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
float frand() {
return (float)rand()/(float)RAND_MAX;
}
@@ -1470,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te
}
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
const float eps = 1e-3;
const float eps = 1e-3f;
return
ggml_sum(ctx,
ggml_neg(ctx,

View File

@@ -16,6 +16,10 @@
#include <iterator>
#include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
@@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
}
void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type),
(int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
float sum = tensor_sum_elements(tensor);
printf("Sum of tensor %s is %6.2f\n", name, sum);
}
@@ -120,7 +124,7 @@ int main(int argc, char ** argv) {
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16;
printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,

41
examples/chat-vicuna.sh Executable file
View File

@@ -0,0 +1,41 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"

View File

@@ -28,6 +28,10 @@
#include <wchar.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -373,7 +377,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} else {
throw std::exception();
}
} catch (const std::exception &e) {
} catch (const std::exception&) {
invalid_param = true;
break;
}

View File

@@ -4,6 +4,10 @@
#include <ctime>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int main(int argc, char ** argv) {
gpt_params params;

View File

@@ -23,11 +23,17 @@
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static console_state con_st;
static llama_context ** g_ctx;
@@ -348,7 +354,7 @@ int main(int argc, char ** argv) {
if ((int)embd.size() > max_embd_size) {
auto skipped_tokens = embd.size() - max_embd_size;
console_set_color(con_st, CONSOLE_COLOR_ERROR);
printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
printf("<<input too long: skipped %" PRIu64 " token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
fflush(stdout);
embd.resize(max_embd_size);

View File

@@ -5,6 +5,10 @@
#include <cmath>
#include <ctime>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];

View File

@@ -19,6 +19,10 @@
#include <thread>
#include <mutex>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;

View File

@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
// init
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);

View File

@@ -12,6 +12,9 @@
#include <algorithm>
#include <string>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
struct random_normal_distribution {
std::mt19937 gen;
@@ -20,7 +23,6 @@ struct random_normal_distribution {
float max;
};
struct random_uniform_distribution {
std::mt19937 gen;
std::uniform_real_distribution<float> rd;
@@ -2366,7 +2368,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
file->write_u32(0);
file->write_u32(0);
file->write_u32(GGML_TYPE_F32);
file->seek(-file->tell() & 31, SEEK_CUR);
file->seek(0-file->tell() & 31, SEEK_CUR);
return;
}
const char * name = ggml_get_name(tensor);
@@ -2381,7 +2383,7 @@ void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
file->write_u32(tensor->type);
file->write_raw(ne, sizeof(ne[0]) * nd);
file->write_raw(name, name_len);
file->seek(-file->tell() & 31, SEEK_CUR);
file->seek(0-file->tell() & 31, SEEK_CUR);
file->write_raw(tensor->data, ggml_nbytes(tensor));
}
@@ -2402,7 +2404,7 @@ void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
std::string name = file->read_string(name_len);
GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0);
file->seek(-file->tell() & 31, SEEK_CUR);
file->seek(0-file->tell() & 31, SEEK_CUR);
file->read_raw(tensor->data, ggml_nbytes(tensor));
}
@@ -2756,8 +2758,8 @@ struct train_params get_default_train_params() {
params.lbfgs_n_iter = 16;
params.adam_n_iter = 16;
params.adam_alpha = 1e-3;
params.adam_decay = 1e-3;
params.adam_alpha = 1e-3f;
params.adam_decay = 1e-3f;
params.mem_model_gb = 2;
params.mem_compute_gb = 24;
@@ -3331,8 +3333,8 @@ int main(int argc, char ** argv) {
int n_gen = params.n_predict;
int sample_ctx = n_tokens - n_tokens/8;
sampler.params.temp = 0.2;
sampler.params.repeat_penalty = 1.1;
sampler.params.temp = 0.2f;
sampler.params.repeat_penalty = 1.1f;
sampler.params.mirostat = 2;
init_sampler(&sampler, lctx);

View File

@@ -25,7 +25,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
} \
} while (0)
#if CUDART_VERSION >= 12
#if CUDART_VERSION >= 12000
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
@@ -167,6 +167,12 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define GGML_CUDA_DMMV_Y 1
#endif
#ifndef K_QUANTS_PER_ITERATION
#define K_QUANTS_PER_ITERATION 2
#else
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
#endif
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -326,37 +332,6 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
}
static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q2_K * x = (const block_q2_K *) vx;
// if n is 0, we want to do the lower 128, else the upper 128,
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
// y[l+16], y[l+48], y[l+80], y[l+112]
int n = iqs/128; // 0 or 1
int r = iqs - 128*n; // 0...120 in steps of 8
int l = r/8; // 0...15 in steps of 1
const float * y = yy + 128*n + l;
const uint8_t * q = x[ib].qs + 32*n + l;
const uint8_t * s = x[ib].scales + 8*n;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
result = sum;
}
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
int r = threadIdx.x/4;
@@ -388,51 +363,6 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
}
static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q3_K * x = (const block_q3_K *) vx;
const uint32_t kmask1 = 0x03030303;
const uint32_t kmask2 = 0x0f0f0f0f;
uint32_t aux[3];
uint32_t utmp[4];
// if n is 0, we want to do the lower 128, else the upper 128,
// covering y[l+0], y[l+32], y[l+64], y[l+96] and
// y[l+16], y[l+48], y[l+80], y[l+112]
int n = iqs/128; // 0 or 1
int r = iqs - 128*n; // 0...120 in steps of 8
int l = r/8; // 0...15 in steps of 1
const float * y = yy + 128*n + l;
const uint8_t * q = x[ib].qs + 32*n + l;
const uint8_t * hm = x[ib].hmask + l;
const int8_t * s = (const int8_t *)utmp + 8*n;
memcpy(aux, x[ib].scales, 12);
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
const float dall = x[ib].d;
const uint8_t m = 1 << (4*n);
float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
+ y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
result = sum * dall;
}
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
if (j < 4) {
d = q[j] & 63; m = q[j + 4] & 63;
@@ -479,38 +409,6 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
}
}
static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q4_K * x = (const block_q4_K *) vx;
// iqs is in 0...248 in steps of 8 =>
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
const float * y = yy + 64*j + ir;
const uint8_t * q = x[ib].qs + 32*j + ir;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
}
result = sum;
}
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
const block_q5_K * x = (const block_q5_K *) vx;
@@ -544,43 +442,6 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
}
static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
const block_q5_K * x = (const block_q5_K *) vx;
// iqs is in 0...248 in steps of 8 =>
const int j = iqs / 64; // j is in 0...3
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
const int is = 2*j; // is is in 0...6 in steps of 2
const float * y = yy + 64*j + ir;
const uint8_t * ql = x[ib].qs + 32*j + ir;
const uint8_t * qh = x[ib].qh + ir;
const float dall = x[ib].d;
const float dmin = x[ib].dmin;
uint8_t sc, m;
get_scale_min_k4(is + 0, x[ib].scales, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, x[ib].scales, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
uint8_t hm = 1 << is;
float sum = 0;
for (int k = 0; k < 4; ++k) {
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
}
hm <<= 1;
for (int k = 0; k < 4; ++k) {
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
}
result = sum;
}
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
const block_q6_K * x = (const block_q6_K *) vx;
@@ -606,31 +467,376 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
static __device__ void vec_dot_q6_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
const block_q6_K * x = (const block_q6_K *) vx;
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int ip = iqs / 128; // 0 or 1
const int il = (iqs - 128*ip)/8; // 0...15
const int is = 8*ip;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
const float * y = yy + 128*ip + il;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const float d = x[ib].d;
const block_q2_K * x = (const block_q2_K *)vx + ib0;
const uint8_t * ql = x[ib].ql + 64*ip + il;
const uint8_t * qh = x[ib].qh + 32*ip + il;
const int8_t * sc = x[ib].scales + is;
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0
result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
+ y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
const int step = 16/K_QUANTS_PER_ITERATION;
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...7
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...14 in steps of 4
const int q_offset = 32*im + l0;
const int s_offset = 8*im;
const int y_offset = 128*im + l0;
float tmp = 0; // partial sum for thread in warp
uint32_t aux[4];
const uint8_t * d = (const uint8_t *)aux;
const uint8_t * m = (const uint8_t *)(aux + 2);
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const float * y = yy + i * QK_K + y_offset;
const uint8_t * q = x[i].qs + q_offset;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
aux[0] = a[0] & 0x0f0f0f0f;
aux[1] = a[1] & 0x0f0f0f0f;
aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
float sum1 = 0, sum2 = 0;
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+ y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+ y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+ y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+ y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+ y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+ y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+ y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
}
tmp += dall * sum1 - dmin * sum2;
}
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) {
const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f;
const int row = blockIdx.x;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const block_q3_K * x = (const block_q3_K *)vx + ib0;
const int tid = threadIdx.x/2; // 0...15
const int ix = threadIdx.x%2; // 0, 1
const int n = 2; // iterations in the inner loop
const int im = tid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - 8*im; // 0...7
const uint8_t m = 1 << (4*im);
const int l0 = n*in; // 0...28 in steps of 4
const int q_offset = 32*im + l0;
const int y_offset = 128*im + l0;
uint16_t utmp[4];
const int8_t * s = (const int8_t *)utmp;
const uint16_t s_shift = 4*im;
float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += 2) {
const float * y = yy + i * QK_K + y_offset;
const uint8_t * q = x[i].qs + q_offset;
const uint8_t * h = x[i].hmask + l0;
const uint16_t * a = (const uint16_t *)x[i].scales;
utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
const float d = x[i].d;
float sum = 0;
for (int l = 0; l < n; ++l) {
sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+ y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+ y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+ y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+ y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+ y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+ y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
}
tmp += d * sum;
}
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
const int row = blockIdx.x;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int tid = threadIdx.x/2; // 0...15
const int ix = threadIdx.x%2;
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 4;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
const block_q4_K * x = (const block_q4_K *)vx + ib0;
float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += 2) {
const uint8_t * q1 = x[i].qs + q_offset;
const uint8_t * q2 = q1 + 64;
const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint16_t * a = (const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
float4 s = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
}
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
}
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
//const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int row = blockIdx.x;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const int tid = threadIdx.x/2; // 0...15
const int ix = threadIdx.x%2;
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
const int n = 4;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
const uint8_t hm1 = 1 << (2*im);
const uint8_t hm2 = hm1 << 4;
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
const block_q5_K * x = (const block_q5_K *)vx + ib0;
float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += 2) {
const uint8_t * ql1 = x[i].qs + q_offset;
const uint8_t * ql2 = ql1 + 64;
const uint8_t * qh = x[i].qh + l0;
const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128;
const float dall = x[i].d;
const float dmin = x[i].dmin;
const uint16_t * a = (const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
float4 sum = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0));
sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0));
sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0));
sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0));
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
}
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
}
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
const block_q6_K * x = (const block_q6_K *)vx + ib0;
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7
#if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
#else
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
#endif
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
const int y_offset = 128*im + l0;
float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const float * y = yy + i * QK_K + y_offset;
const uint8_t * ql = x[i].ql + ql_offset;
const uint8_t * qh = x[i].qh + qh_offset;
const int8_t * s = x[i].scales + s_offset;
const float d = x[i].d;
#if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+ y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+ y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+ y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp += sum;
#else
float sum = 0;
for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+ y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+ y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
tmp += sum;
#endif
}
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
@@ -712,46 +918,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
}
}
template <int n_thread, dot_kernel_k_t dot_kernel>
static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row >= nrows) {
return;
}
const int tid = threadIdx.x;
const int iter_stride = QK_K;
const int vals_per_iter = iter_stride / n_thread;
const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row;
float tmp = 0; // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
const int ib = ib0 + col/QK_K; // x block index
const int iqs = col%QK_K; // x quant index
const int iybs = col - col%QK_K; // y block start index
float v;
dot_kernel(vx, ib, iqs, y + iybs, v);
tmp += v;
}
// sum up partial sums and write back result
__syncthreads();
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
}
if (tid == 0) {
dst[row] = tmp;
}
}
static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
const half * x = (half *) vx;
@@ -1094,43 +1260,34 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_k<32, vec_dot_q2_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2;
const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_k<32, vec_dot_q3_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
const dim3 block_dims(32, 1, 1);
dequantize_mul_mat_vec_q3_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2;
const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_k<32, vec_dot_q4_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
const dim3 block_dims(32, 1, 1);
dequantize_mul_mat_vec_q4_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2;
const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_k<32, vec_dot_q5_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
const dim3 block_dims(32, 1, 1);
dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2;
const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_k<32, vec_dot_q6_K><<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
}
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {

6
ggml.c
View File

@@ -35,6 +35,12 @@
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif
#if defined(_WIN32)
#include <windows.h>

View File

@@ -40,6 +40,10 @@
#include <sstream>
#include <numeric>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1654,7 +1658,7 @@ static bool llama_eval_internal(
// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.norm);
offload_func_nr(cur);
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name(cur, "result_norm");
embeddings = cur;

View File

@@ -244,9 +244,9 @@ extern "C" {
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();
LLAMA_API llama_token llama_token_nl();
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
LLAMA_API llama_token llama_token_nl(); // next-line
// Sampling functions

View File

@@ -10,6 +10,10 @@
#include <ggml.h>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
constexpr int kVecSize = 1 << 18;
float drawFromGaussianPdf(std::mt19937& rndm) {

View File

@@ -9,12 +9,15 @@
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075;
const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040;
const float MAX_DOT_PRODUCT_ERROR = 0.02;
const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
const float MAX_DOT_PRODUCT_ERROR = 0.02f;
const char* RESULT_STR[] = {"ok", "FAILED"};

View File

@@ -13,6 +13,10 @@
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define MAX_ALIGNMENT 64
#define QK 32
#define WARMUP 5

View File

@@ -176,27 +176,27 @@ void test_frequency_presence_penalty(
int main(void) {
ggml_time_init();
test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1);
test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0);
test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7);
test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25);
test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75);
test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5);
test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5);
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0);
test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0);
test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0);
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0);
test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0);
test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0);
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
printf("OK\n");
}

View File

@@ -53,7 +53,7 @@ int main(int argc, char **argv) {
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res(test_kv.first.size());
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
res.resize(n);
bool correct = res.size() == test_kv.second.size();