Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-12 14:03:20 +02:00)

Compare commits: b1420...llama-refa (48 commits)
| SHA1 |
|---|
| afb3929279 |
| 07178c98e1 |
| 5baefef497 |
| 207b51900e |
| 6e08281e58 |
| 2046eb4345 |
| 71a09da301 |
| d69d777c02 |
| 210e6e5d02 |
| 79ad734417 |
| 761087932b |
| 8925cf9ef8 |
| 1e9c5443c2 |
| da936188d8 |
| 739b85c985 |
| 25cfbf6776 |
| b4ad03b3a7 |
| 79617902ea |
| e14aa46151 |
| 0dc05b8433 |
| 4e98897ede |
| 51c4f9ee9f |
| 3af8771389 |
| 83d2c43791 |
| 38aca9e1ab |
| 5946d98fc8 |
| 8b2420d249 |
| ff3bad83e2 |
| 82a6646e02 |
| ba231e8a6d |
| 8a2f2fea29 |
| bd6d9e2059 |
| ee1a0ec9cb |
| 177461104b |
| fdee152e4e |
| 41aee4df82 |
| 6d459cbfbe |
| c8d6a1f34a |
| 2f9ec7e271 |
| 34b2a5e1ee |
| 6961c4bd0b |
| cc44877486 |
| ad93962657 |
| 1717521cdb |
| b2f7e04bd3 |
| abd21fc99f |
| 2b4ea35e56 |
| daab3d7f45 |
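The same range can be inspected locally. A minimal sketch, assuming the `b1420` tag and the `llama-refa` branch referenced by this compare view still exist on the remote:

```sh
# clone the mirrored repository
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp

# fetch the compared branch (assumption: it is still published on the remote)
git fetch origin llama-refa

# list the commits in the b1420...llama-refa range, one line per SHA
git log --oneline b1420..origin/llama-refa

# show per-file change counts, matching the file sections below
git diff --stat b1420...origin/llama-refa
```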
2  .github/ISSUE_TEMPLATE/bug.md (vendored)
@@ -1,7 +1,7 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug"]
labels: ["bug-unconfirmed"]
assignees: ''

---

@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
|
||||
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
|
||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
@@ -93,7 +94,6 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
@@ -277,13 +277,8 @@ if (LLAMA_BLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_HEADERS_EXTRA k_quants.h)
|
||||
set(GGML_SOURCES_EXTRA k_quants.c)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
@@ -305,6 +300,9 @@ if (LLAMA_CUBLAS)
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
||||
@@ -331,6 +329,7 @@ if (LLAMA_CUBLAS)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
@@ -404,6 +403,9 @@ if (LLAMA_HIPBLAS)
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_MMQ)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
@@ -665,6 +667,8 @@ add_library(ggml OBJECT
|
||||
ggml-alloc.h
|
||||
ggml-backend.c
|
||||
ggml-backend.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
|
||||
27  Makefile
@@ -342,13 +342,9 @@ else
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
ifdef LLAMA_QKK_64
|
||||
MK_CPPFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac OS - include Accelerate framework.
|
||||
@@ -365,7 +361,7 @@ ifdef LLAMA_MPI
|
||||
MK_CPPFLAGS += -DGGML_USE_MPI
|
||||
MK_CFLAGS += -Wno-cast-qual
|
||||
MK_CXXFLAGS += -Wno-cast-qual
|
||||
OBJS += ggml-mpi.o
|
||||
OBJS += ggml-mpi.o
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
@@ -382,7 +378,7 @@ endif # LLAMA_BLIS
|
||||
ifdef LLAMA_CUBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
OBJS += ggml-cuda.o
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(LLAMA_CUDA_NVCC)
|
||||
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
|
||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # LLAMA_CUDA_FORCE_DMMV
|
||||
ifdef LLAMA_CUDA_FORCE_MMQ
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||
endif # LLAMA_CUDA_FORCE_MMQ
|
||||
ifdef LLAMA_CUDA_DMMV_X
|
||||
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
else
|
||||
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
# combine build flags with cmdline overrides
|
||||
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
|
||||
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
@@ -539,15 +533,18 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o
|
||||
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
|
||||
COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o
|
||||
|
||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||
common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
|
||||
|
||||
@@ -42,13 +42,12 @@ let package = Package(
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"k_quants.c",
"ggml-quants.c",
] + additionalSources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14

21  build.zig
@@ -116,15 +116,10 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
var make = try Maker.init(b);
|
||||
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
|
||||
|
||||
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
|
||||
try make.addFlag("-DGGML_USE_K_QUANTS");
|
||||
const k_quants = make.obj("k_quants", "k_quants.c");
|
||||
try make.objs.append(k_quants);
|
||||
}
|
||||
|
||||
const ggml = make.obj("ggml", "ggml.c");
|
||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("console", "common/console.cpp");
|
||||
@@ -133,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
@@ -224,6 +224,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
sparams.temp = std::stof(argv[i]);
|
||||
sparams.temp = std::max(sparams.temp, 0.0f);
|
||||
} else if (arg == "--tfs") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -743,7 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif
|
||||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
@@ -888,7 +889,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
|
||||
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
||||
llama_kv_cache_tokens_rm(lctx, -1, -1);
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
|
||||
|
||||
35  common/log.h
@@ -97,22 +97,23 @@
|
||||
#define LOG_TEE_TARGET stderr
|
||||
#endif
|
||||
|
||||
// NOTE: currently disabled as it produces too many log files
|
||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
||||
inline std::string log_get_pid()
|
||||
{
|
||||
static std::string pid;
|
||||
if (pid.empty())
|
||||
{
|
||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// trying to write to the same log.
|
||||
std::stringstream ss;
|
||||
ss << std::this_thread::get_id();
|
||||
pid = ss.str();
|
||||
}
|
||||
|
||||
return pid;
|
||||
}
|
||||
//inline std::string log_get_pid()
|
||||
//{
|
||||
// static std::string pid;
|
||||
// if (pid.empty())
|
||||
// {
|
||||
// // std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// // it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// // trying to write to the same log.
|
||||
// std::stringstream ss;
|
||||
// ss << std::this_thread::get_id();
|
||||
// pid = ss.str();
|
||||
// }
|
||||
//
|
||||
// return pid;
|
||||
//}
|
||||
|
||||
// Utility function for generating log file names with unique id based on thread id.
|
||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
||||
@@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
|
||||
std::stringstream buf;
|
||||
|
||||
buf << log_file_basename;
|
||||
buf << ".";
|
||||
buf << log_get_pid();
|
||||
//buf << ".";
|
||||
//buf << log_get_pid();
|
||||
buf << ".";
|
||||
buf << log_file_extension;
|
||||
|
||||
|
||||
@@ -167,8 +167,12 @@ llama_token llama_sampling_sample(
|
||||
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// greedy sampling
|
||||
if (temp < 0.0) {
|
||||
// greedy sampling, with probs
|
||||
llama_sample_softmax(ctx_main, &cur_p);
|
||||
id = cur_p.data[0].id;
|
||||
} else if (temp == 0.0) {
|
||||
// greedy sampling, no probs
|
||||
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
|
||||
21  convert.py
@@ -366,16 +366,19 @@ class SentencePieceVocab:
|
||||
added_tokens = {}
|
||||
|
||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
actual_ids = sorted(added_tokens.values())
|
||||
if expected_ids != actual_ids:
|
||||
raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
|
||||
|
||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||
self.added_tokens_list = [text for (text, idx) in items]
|
||||
self.vocab_size_base: int = vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||
actual_new_ids = sorted(new_tokens.keys())
|
||||
|
||||
if expected_new_ids != actual_new_ids:
|
||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||
|
||||
# Token pieces that were added to the base vocabulary.
|
||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||
self.vocab_size_base = vocab_size
|
||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
|
||||
@@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
|
||||
@@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
|
||||
@@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
@@ -21,6 +21,9 @@ int main(int argc, char ** argv) {
|
||||
// total length of the sequences including the prompt
|
||||
int n_len = 32;
|
||||
|
||||
// number of layers to offload to the GPU
|
||||
int n_gpu_layers = 0;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
@@ -37,6 +40,10 @@ int main(int argc, char ** argv) {
|
||||
n_len = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
n_gpu_layers = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
@@ -49,7 +56,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
|
||||
@@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
test t(inst, lmodel, ctx);
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// warmup run
|
||||
if (t.n_prompt > 0) {
|
||||
@@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
if (t.n_prompt > 0) {
|
||||
|
||||
@@ -16,6 +16,8 @@ add_library(common OBJECT
|
||||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
|
||||
@@ -298,7 +298,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
|
||||
@@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
|
||||
if (logits.empty()) {
|
||||
|
||||
@@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
@@ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
|
||||
#endif
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
@@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
}
|
||||
|
||||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
if (it.name != "COPY") {
|
||||
@@ -103,6 +102,8 @@ int main(int argc, char ** argv) {
|
||||
params.quantize_output_tensor = false;
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||
params.pure = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
@@ -454,7 +454,7 @@ struct llama_client_slot
|
||||
}
|
||||
|
||||
void release() {
|
||||
if (state == PROCESSING)
|
||||
if (state == IDLE || state == PROCESSING)
|
||||
{
|
||||
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
||||
command = RELEASE;
|
||||
@@ -754,6 +754,7 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
slot->params.antiprompt.clear();
|
||||
|
||||
const auto &stop = data.find("stop");
|
||||
if (stop != data.end() && stop->is_array())
|
||||
{
|
||||
@@ -856,7 +857,7 @@ struct llama_server_context
|
||||
|
||||
void kv_cache_clear() {
|
||||
// clear the entire KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
clean_kv_cache = false;
|
||||
}
|
||||
|
||||
@@ -867,7 +868,7 @@ struct llama_server_context
|
||||
|
||||
kv_cache_clear();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i)
|
||||
for (int i = 0; i < (int) system_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
@@ -894,16 +895,8 @@ struct llama_server_context
|
||||
{
|
||||
slot.release();
|
||||
}
|
||||
wait_all_are_idle();
|
||||
all_slots_are_idle = true;
|
||||
|
||||
// wait until system prompt load
|
||||
system_need_update = true;
|
||||
while (system_need_update)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5));
|
||||
}
|
||||
// system prompt loaded, continue
|
||||
}
|
||||
|
||||
void process_system_prompt_data(const json &sys_props) {
|
||||
@@ -915,26 +908,6 @@ struct llama_server_context
|
||||
{
|
||||
notify_system_prompt_changed();
|
||||
}
|
||||
else
|
||||
{
|
||||
system_need_update = true;
|
||||
}
|
||||
}
|
||||
|
||||
void wait_all_are_idle() {
|
||||
bool wait = true;
|
||||
while (wait)
|
||||
{
|
||||
wait = false;
|
||||
for (auto &slot : slots)
|
||||
{
|
||||
if (!slot.available())
|
||||
{
|
||||
wait = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
||||
@@ -965,7 +938,6 @@ struct llama_server_context
|
||||
slot.has_next_token = false;
|
||||
}
|
||||
stop_pos = pos;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1444,7 +1416,7 @@ struct llama_server_context
|
||||
process_tasks();
|
||||
|
||||
// update the system prompt wait until all slots are idle state
|
||||
if (system_need_update)
|
||||
if (system_need_update && all_slots_are_idle)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
update_system_prompt();
|
||||
@@ -1498,7 +1470,7 @@ struct llama_server_context
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
||||
if (slot.command == RELEASE)
|
||||
{
|
||||
slot.state = IDLE;
|
||||
slot.command = NONE;
|
||||
@@ -1509,7 +1481,7 @@ struct llama_server_context
|
||||
continue;
|
||||
}
|
||||
|
||||
if (slot.state == IDLE || slot.command == RELEASE)
|
||||
if (slot.state == IDLE)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -1530,6 +1502,17 @@ struct llama_server_context
|
||||
{
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
|
||||
|
||||
// empty prompt passed -> release the slot and send empty response
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
|
||||
{
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
continue;
|
||||
}
|
||||
|
||||
// need process the prompt
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
||||
{
|
||||
@@ -1749,8 +1732,8 @@ struct llama_server_context
|
||||
if (!process_token(result, slot))
|
||||
{
|
||||
slot.release();
|
||||
send_final_response(slot);
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
@@ -1766,15 +1749,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
@@ -1924,6 +1908,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--threads-batch" || arg == "-tb")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-b" || arg == "--batch-size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -2285,7 +2278,7 @@ int main(int argc, char **argv)
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
if(!result.error && result.stop) {
|
||||
if (!result.error && result.stop) {
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
|
||||
}
|
||||
else
|
||||
@@ -2312,7 +2305,7 @@ int main(int argc, char **argv)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if(result.stop) {
|
||||
if (result.stop) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
|
||||
llama_batch batch = llama_batch_init(512, 0, 1);
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = tokens_list.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
batch.token[i] = tokens_list[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
for (size_t i = 0; i < tokens_list.size(); i++) {
|
||||
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
|
||||
fflush(stdout);
|
||||
|
||||
// prepare the next batch
|
||||
batch.n_tokens = 0;
|
||||
llama_batch_clear(batch);
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token [batch.n_tokens] = new_token_id;
|
||||
batch.pos [batch.n_tokens] = n_cur;
|
||||
batch.seq_id[batch.n_tokens] = 0;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct seq_draft {
|
||||
bool active = false;
|
||||
bool drafting = false;
|
||||
@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
{
|
||||
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
||||
const int n_vocab_dft = llama_n_vocab(model_dft);
|
||||
const int vocab_diff = n_vocab_tgt > n_vocab_dft
|
||||
? n_vocab_tgt - n_vocab_dft
|
||||
: n_vocab_dft - n_vocab_tgt;
|
||||
|
||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
||||
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
|
||||
llama_token_to_piece(ctx_tgt, i).c_str(),
|
||||
llama_token_to_piece(ctx_dft, i).c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
||||
@@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sparams.temp = std::max(0.01f, params.sparams.temp);
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode (ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
|
||||
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
++n_past_tgt;
|
||||
}
|
||||
|
||||
12  flake.lock (generated)
@@ -5,11 +5,11 @@
"systems": "systems"
},
"locked": {
"lastModified": 1692799911,
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
"lastModified": 1694529238,
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1692913444,
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
"lastModified": 1698318101,
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
"type": "github"
},
"original": {

17  flake.nix
@@ -11,8 +11,7 @@
|
||||
meta.mainProgram = "llama";
|
||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||
buildInputs = with pkgs; [ openmpi ];
|
||||
osSpecific = with pkgs; buildInputs ++
|
||||
(
|
||||
osSpecific = with pkgs; buildInputs ++ (
|
||||
if isAarch64 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
@@ -51,6 +50,9 @@
|
||||
};
|
||||
llama-python =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
@@ -93,12 +95,15 @@
|
||||
};
|
||||
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
|
||||
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_HIPBLAS=1"
|
||||
"-DCMAKE_C_COMPILER=hipcc"
|
||||
"-DCMAKE_CXX_COMPILER=hipcc"
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
|
||||
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
];
|
||||
};
|
||||
apps.llama-server = {
|
||||
@@ -126,5 +131,9 @@
|
||||
buildInputs = [ llama-python ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
devShells.extra = pkgs.mkShell {
|
||||
buildInputs = [ llama-python-extra ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
332  ggml-cuda.cu
@@ -29,6 +29,8 @@
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
@@ -85,6 +87,24 @@
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
|
||||
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
||||
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
||||
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
||||
// - 7B quantum model: +100-200 MB
|
||||
// - 13B quantum model: +200-400 MB
|
||||
//
|
||||
//#define GGML_CUDA_FORCE_MMQ
|
||||
|
||||
// TODO: improve this to be correct for more hardware
|
||||
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
||||
// probably other such cases, and not sure what happens on AMD hardware
|
||||
#if !defined(GGML_CUDA_FORCE_MMQ)
|
||||
#define CUDA_USE_TENSOR_CORES
|
||||
#endif
|
||||
|
||||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
@@ -468,7 +488,6 @@ static int g_device_count = -1;
|
||||
static int g_main_device = 0;
|
||||
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static bool g_mul_mat_q = true;
|
||||
|
||||
static void * g_scratch_buffer = nullptr;
|
||||
static size_t g_scratch_size = 0; // disabled by default
|
||||
@@ -3552,9 +3571,15 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
#define MMQ_X_Q4_0_RDNA1 64
|
||||
#define MMQ_Y_Q4_0_RDNA1 64
|
||||
#define NWARPS_Q4_0_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q4_0_AMPERE 4
|
||||
#define MMQ_Y_Q4_0_AMPERE 32
|
||||
#define NWARPS_Q4_0_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q4_0_AMPERE 64
|
||||
#define MMQ_Y_Q4_0_AMPERE 128
|
||||
#define NWARPS_Q4_0_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q4_0_PASCAL 64
|
||||
#define MMQ_Y_Q4_0_PASCAL 64
|
||||
#define NWARPS_Q4_0_PASCAL 8
|
||||
@@ -3613,9 +3638,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q4_1_RDNA1 64
|
||||
#define MMQ_Y_Q4_1_RDNA1 64
|
||||
#define NWARPS_Q4_1_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q4_1_AMPERE 4
|
||||
#define MMQ_Y_Q4_1_AMPERE 32
|
||||
#define NWARPS_Q4_1_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q4_1_AMPERE 64
|
||||
#define MMQ_Y_Q4_1_AMPERE 128
|
||||
#define NWARPS_Q4_1_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q4_1_PASCAL 64
|
||||
#define MMQ_Y_Q4_1_PASCAL 64
|
||||
#define NWARPS_Q4_1_PASCAL 8
|
||||
@@ -3676,9 +3707,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q5_0_RDNA1 64
|
||||
#define MMQ_Y_Q5_0_RDNA1 64
|
||||
#define NWARPS_Q5_0_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q5_0_AMPERE 4
|
||||
#define MMQ_Y_Q5_0_AMPERE 32
|
||||
#define NWARPS_Q5_0_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q5_0_AMPERE 128
|
||||
#define MMQ_Y_Q5_0_AMPERE 64
|
||||
#define NWARPS_Q5_0_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q5_0_PASCAL 64
|
||||
#define MMQ_Y_Q5_0_PASCAL 64
|
||||
#define NWARPS_Q5_0_PASCAL 8
|
||||
@@ -3737,9 +3774,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q5_1_RDNA1 64
|
||||
#define MMQ_Y_Q5_1_RDNA1 64
|
||||
#define NWARPS_Q5_1_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q5_1_AMPERE 4
|
||||
#define MMQ_Y_Q5_1_AMPERE 32
|
||||
#define NWARPS_Q5_1_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q5_1_AMPERE 128
|
||||
#define MMQ_Y_Q5_1_AMPERE 64
|
||||
#define NWARPS_Q5_1_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q5_1_PASCAL 64
|
||||
#define MMQ_Y_Q5_1_PASCAL 64
|
||||
#define NWARPS_Q5_1_PASCAL 8
|
||||
@@ -3798,9 +3841,15 @@ mul_mat_q5_1(
|
||||
#define MMQ_X_Q8_0_RDNA1 64
|
||||
#define MMQ_Y_Q8_0_RDNA1 64
|
||||
#define NWARPS_Q8_0_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q8_0_AMPERE 4
|
||||
#define MMQ_Y_Q8_0_AMPERE 32
|
||||
#define NWARPS_Q8_0_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q8_0_AMPERE 128
|
||||
#define MMQ_Y_Q8_0_AMPERE 64
|
||||
#define NWARPS_Q8_0_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q8_0_PASCAL 64
|
||||
#define MMQ_Y_Q8_0_PASCAL 64
|
||||
#define NWARPS_Q8_0_PASCAL 8
|
||||
@@ -3859,9 +3908,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q2_K_RDNA1 128
|
||||
#define MMQ_Y_Q2_K_RDNA1 32
|
||||
#define NWARPS_Q2_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q2_K_AMPERE 4
|
||||
#define MMQ_Y_Q2_K_AMPERE 32
|
||||
#define NWARPS_Q2_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q2_K_AMPERE 64
|
||||
#define MMQ_Y_Q2_K_AMPERE 128
|
||||
#define NWARPS_Q2_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q2_K_PASCAL 64
|
||||
#define MMQ_Y_Q2_K_PASCAL 64
|
||||
#define NWARPS_Q2_K_PASCAL 8
|
||||
@@ -3920,9 +3975,15 @@ mul_mat_q2_K(
|
||||
#define MMQ_X_Q3_K_RDNA1 32
|
||||
#define MMQ_Y_Q3_K_RDNA1 128
|
||||
#define NWARPS_Q3_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q3_K_AMPERE 4
|
||||
#define MMQ_Y_Q3_K_AMPERE 32
|
||||
#define NWARPS_Q3_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q3_K_AMPERE 128
|
||||
#define MMQ_Y_Q3_K_AMPERE 128
|
||||
#define NWARPS_Q3_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q3_K_PASCAL 64
|
||||
#define MMQ_Y_Q3_K_PASCAL 64
|
||||
#define NWARPS_Q3_K_PASCAL 8
|
||||
@@ -3983,9 +4044,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q4_K_RDNA1 32
|
||||
#define MMQ_Y_Q4_K_RDNA1 64
|
||||
#define NWARPS_Q4_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q4_K_AMPERE 4
|
||||
#define MMQ_Y_Q4_K_AMPERE 32
|
||||
#define NWARPS_Q4_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q4_K_AMPERE 64
|
||||
#define MMQ_Y_Q4_K_AMPERE 128
|
||||
#define NWARPS_Q4_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q4_K_PASCAL 64
|
||||
#define MMQ_Y_Q4_K_PASCAL 64
|
||||
#define NWARPS_Q4_K_PASCAL 8
|
||||
@@ -4046,9 +4113,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q5_K_RDNA1 32
|
||||
#define MMQ_Y_Q5_K_RDNA1 64
|
||||
#define NWARPS_Q5_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q5_K_AMPERE 4
|
||||
#define MMQ_Y_Q5_K_AMPERE 32
|
||||
#define NWARPS_Q5_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q5_K_AMPERE 64
|
||||
#define MMQ_Y_Q5_K_AMPERE 128
|
||||
#define NWARPS_Q5_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q5_K_PASCAL 64
|
||||
#define MMQ_Y_Q5_K_PASCAL 64
|
||||
#define NWARPS_Q5_K_PASCAL 8
|
||||
@@ -4107,9 +4180,15 @@ mul_mat_q5_K(
|
||||
#define MMQ_X_Q6_K_RDNA1 32
|
||||
#define MMQ_Y_Q6_K_RDNA1 64
|
||||
#define NWARPS_Q6_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q6_K_AMPERE 4
|
||||
#define MMQ_Y_Q6_K_AMPERE 32
|
||||
#define NWARPS_Q6_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q6_K_AMPERE 64
|
||||
#define MMQ_Y_Q6_K_AMPERE 64
|
||||
#define NWARPS_Q6_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q6_K_PASCAL 64
|
||||
#define MMQ_Y_Q6_K_PASCAL 64
|
||||
#define NWARPS_Q6_K_PASCAL 8
|
||||
@@ -4326,13 +4405,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int channel_x = channel / channel_x_divisor;
|
||||
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_dst = nrows_x;
|
||||
const int row_dst = row_x;
|
||||
const int row_dst = row_x;
|
||||
|
||||
const int idst = channel*nrows_dst + row_dst;
|
||||
|
||||
@@ -4345,13 +4424,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
const int row_y = col_x;
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const int iy = channel*nrows_y + row_y;
|
||||
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
tmp += xi * y[iy];
|
||||
}
|
||||
|
||||
@@ -5661,11 +5740,21 @@ void ggml_init_cublas() {
|
||||
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
||||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
#if defined(GGML_CUDA_FORCE_MMQ)
|
||||
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
||||
#endif
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
||||
#endif
|
||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||
fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
|
||||
g_tensor_split[id] = total_vram;
|
||||
total_vram += prop.totalGlobalMem;
|
||||
@@ -5675,15 +5764,15 @@ void ggml_init_cublas() {
|
||||
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
g_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
// create cuda streams
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
@@ -6252,16 +6341,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
||||
|
||||
GGML_ASSERT(src0_dd_i != nullptr);
|
||||
GGML_ASSERT(src0_dd_i != nullptr);
|
||||
GGML_ASSERT(src1_ddf_i != nullptr);
|
||||
GGML_ASSERT(dst_dd_i != nullptr);
|
||||
|
||||
GGML_ASSERT(dst_dd_i != nullptr);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
int id;
|
||||
@@ -6346,7 +6434,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
row_diff, src1_ncols, ne10,
|
||||
&alpha, src0_ddf_i, ne00,
|
||||
src1_ddf_i, ne10,
|
||||
src1_ddf_i, ne10,
|
||||
&beta, dst_dd_i, ldc));
|
||||
|
||||
if (src0_as != 0) {
|
||||
@@ -7013,7 +7101,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
||||
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -7023,11 +7112,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
@@ -7046,27 +7135,200 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
    const int64_t ne03 = src0->ne[3];

    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];

    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);

    const int64_t ne1 = ggml_nelements(src1);
    const int64_t ne  = ggml_nelements(dst);

    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

    int id;
    CUDA_CHECK(cudaGetDevice(&id));
    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    void * src0_ddq = src0_extra->data_device[g_main_device];
    half * src0_as_f16 = (half *) src0_ddq;

    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

    // convert src1 to fp16
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
    GGML_ASSERT(to_fp16_cuda != nullptr);

    size_t src1_as = 0;
    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

    size_t dst_as = 0;
    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);

    // broadcast factors
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

    const half alpha_f16 = 1.0f;
    const half beta_f16  = 0.0f;

#if 0
    // use cublasGemmEx
    {
        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                int i03 = i13 / r3;
                int i02 = i12 / r2;

                CUBLAS_CHECK(
                cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                    ne01, ne11, ne10,
                    &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3],   CUDA_R_16F, nb01/sizeof(half),
                                (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
                    &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
                    CUBLAS_COMPUTE_16F,
                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
            }
        }
    }
#else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        // use cublasGemmStridedBatchedEx
        CUBLAS_CHECK(
        cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half),  src0->nb[2]/sizeof(half),  // strideA
                            (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
                &beta_f16,  (      char *)     dst_f16, CUDA_R_16F, ne01,                dst->nb[2]/sizeof(float), // strideC
                ne12*ne13,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
        const int ne23 = ne12*ne13;

        // TODO: avoid this alloc
        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));

        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                int i03 = i13 / r3;
                int i02 = i12 / r2;

                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3];
                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
                ptrs[2*ne23 + i12 + i13*ne12] = (char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
            }
        }

        // allocate device memory for pointers
        void ** ptrs_as = nullptr;
        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));

        // TODO: this does not work for some reason -- not sure why?
        //size_t ptrs_s = 0;
        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);

        // copy pointers to device
        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));

        free(ptrs);

        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
                &beta_f16,  (      void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
                ne23,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));

        // free device memory for pointers
        CUDA_CHECK(cudaFree(ptrs_as));
        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
    }
#endif

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);

    ggml_cuda_pool_free(src1_as_f16, src1_as);
    ggml_cuda_pool_free(dst_f16, dst_as);
}

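The broadcast handling above is easiest to see in isolation. The following stand-alone sketch is an editorial illustration, not part of the diff, and its batch sizes are made up; it only reproduces the r2/r3 mapping that decides which src0 matrix each (i12, i13) slice of src1 is multiplied against, mirroring the pointer table built for cublasGemmBatchedEx:

// Illustration only: broadcast factors r2 = ne12/ne02 and r3 = ne13/ne03
// map src1 batches onto src0 batches via integer division.
#include <cstdio>

int main() {
    const int ne02 = 2, ne03 = 1;   // src0 batch dims (hypothetical)
    const int ne12 = 8, ne13 = 1;   // src1 batch dims (hypothetical)
    const int r2 = ne12 / ne02;     // = 4: four src1 batches share one src0 batch
    const int r3 = ne13 / ne03;     // = 1

    for (int i13 = 0; i13 < ne13; ++i13) {
        for (int i12 = 0; i12 < ne12; ++i12) {
            printf("src1 batch (%d,%d) -> src0 batch (%d,%d)\n", i12, i13, i12 / r2, i13 / r3);
        }
    }
    return 0;
}
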
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
        src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
    const bool all_on_device =
        (src0->backend == GGML_BACKEND_GPU) &&
        (src1->backend == GGML_BACKEND_GPU) &&
        ( dst->backend == GGML_BACKEND_GPU);

    int64_t min_compute_capability = INT_MAX;
    for (int64_t id = 0; id < g_device_count; ++id) {
        if (min_compute_capability > g_compute_capabilities[id]
            && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
            min_compute_capability = g_compute_capabilities[id];
        }
    }

    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
#ifdef CUDA_USE_TENSOR_CORES
    const bool use_tensor_cores = true;
#else
    const bool use_tensor_cores = false;
#endif

    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
    //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
    } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {

#ifdef GGML_CUDA_FORCE_DMMV
            const bool use_mul_mat_vec_q = false;
#else

@@ -7079,7 +7341,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
        }
    } else {
        if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);

        // when tensor cores are available, use them for large batch size
        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
        }

        if (use_mul_mat_q) {
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
        } else {
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);

@@ -7433,10 +7703,6 @@ void ggml_cuda_set_main_device(const int main_device) {
    }
}

void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
    g_mul_mat_q = mul_mat_q;
}

void ggml_cuda_set_scratch_size(const size_t scratch_size) {
    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
    // it still won't always work as expected, but it's better than nothing

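For readers skimming the dispatch above, here is a condensed editorial restatement, not code from the diff, with hypothetical names and parameters: quantized src0 goes to the custom mul_mat_q kernels when DP4A is available, and falls back to cuBLAS when tensor cores are present and the batch is large.

// Condensed restatement of the kernel-selection rule at the end of ggml_cuda_mul_mat().
static bool pick_mul_mat_q(bool use_tensor_cores, int min_cc, long batch,
                           bool src0_quantized, int cc_dp4a, int cc_volta, long mmq_max_batch) {
    bool use_mmq = src0_quantized && min_cc >= cc_dp4a;
    if (use_tensor_cores && min_cc >= cc_volta && batch > mmq_max_batch) {
        use_mmq = false;  // large batches: cuBLAS with tensor cores wins (see PR 3776)
    }
    return use_mmq;
}
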
237
ggml-impl.h
Normal file
@@ -0,0 +1,237 @@
#pragma once

#include "ggml.h"

// GGML internal header

#include <assert.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h>   // fabsf

#ifdef __cplusplus
extern "C" {
#endif

// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#ifndef __SSE3__
#define __SSE3__
#endif
#endif

#undef MIN
#undef MAX

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>

#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)

#define GGML_FP16_TO_FP32(x) ((float) (x))
#define GGML_FP32_TO_FP16(x) (x)

#else

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif

#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif

#ifdef __F16C__

#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif

#elif defined(__POWER9_VECTOR__)

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
}

#else

// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16

static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
    } fp32;
    fp32.as_bits = w;
    return fp32.as_value;
}

static inline uint32_t fp32_to_bits(float f) {
    union {
        float as_value;
        uint32_t as_bits;
    } fp32;
    fp32.as_value = f;
    return fp32.as_bits;
}

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;

    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __F16C__

#endif // __ARM_NEON

// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)

inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

#endif

// TODO: backend v2 PR

#ifdef __cplusplus
}
#endif

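As a side note on the lookup table declared in the new header above, here is a minimal sketch of how an initializer in the spirit of ggml_init() could populate it. This is editorial illustration, not part of the diff: the local table and function names are hypothetical, and only the GGML_COMPUTE_FP16_TO_FP32 macro and the ggml_fp16_t type come from the header.

// Sketch: fill a 64K-entry fp16 -> fp32 cache like ggml_table_f32_f16.
#include "ggml-impl.h"
#include <cstdint>
#include <cstring>
#include <cstdio>

static float table_f32_f16[1 << 16];  // stand-in for ggml_table_f32_f16

static void init_fp16_table(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        uint16_t u = (uint16_t) i;
        ggml_fp16_t h;
        memcpy(&h, &u, sizeof(h));                       // reinterpret the raw 16-bit pattern
        table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(h); // cache its fp32 value
    }
}

int main() {
    init_fp16_table();
    printf("0x3C00 -> %f\n", table_f32_f16[0x3C00]);  // 1.0 in IEEE fp16
    return 0;
}
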
15
ggml-metal.m
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

    NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
    if (sourcePath == nil) {
        GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
        sourcePath = @"ggml-metal.metal";
    }
    GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
    NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
    if (error) {

@@ -234,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    // load kernels
    {
        NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \

        /*
        GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
            (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
            (int) ctx->pipeline_##name.threadExecutionWidth); \
        */
#define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
        if (error) { \
            GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
            return NULL; \
        }

File diff suppressed because it is too large
@@ -1,11 +1,63 @@
#pragma once

#include "ggml.h"
#include "ggml-impl.h"

// GGML internal header

#include <stdint.h>
#include <assert.h>
#include <stddef.h>

#define QK4_0 32
typedef struct {
    ggml_fp16_t d;          // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

#define QK4_1 32
typedef struct {
    ggml_fp16_t d;          // delta
    ggml_fp16_t m;          // min
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");

#define QK5_0 32
typedef struct {
    ggml_fp16_t d;          // delta
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_0 / 2];  // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

#define QK5_1 32
typedef struct {
    ggml_fp16_t d;          // delta
    ggml_fp16_t m;          // min
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_1 / 2];  // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

#define QK8_0 32
typedef struct {
    ggml_fp16_t d;          // delta
    int8_t qs[QK8_0];       // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

#define QK8_1 32
typedef struct {
    float d;                // delta
    float s;                // d * sum(qs[i])
    int8_t qs[QK8_1];       // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");

//
// Super-block quantization structures
//

// Super-block size
#ifdef GGML_QKK_64
#define QK_K 64

@@ -15,18 +67,6 @@
#define K_SCALE_SIZE 12
#endif

#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

//
// Super-block quantization structures
//

// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each

@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_

// Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);

void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);

@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);

void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);

void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);

@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);

// Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);

void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);

@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);

// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

// Quantization with histogram collection
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

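The block layouts above fix the storage cost of each format. As a small, self-contained illustration (editorial, not part of the diff) of the arithmetic for q4_0: one block stores 32 weights as an fp16 delta plus 16 bytes of packed 4-bit quants, i.e. 18 bytes, or 4.5 bits per weight.

// Bits-per-weight arithmetic for the q4_0 block layout shown above.
#include <cstdio>

int main() {
    const int block_bytes = 2 + 32 / 2;  // fp16 delta + 16 bytes of 4-bit quants
    const int weights     = 32;          // QK4_0
    printf("q4_0: %.2f bits/weight\n", 8.0 * block_bytes / weights);  // prints 4.50
    return 0;
}
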
24
ggml.h
@@ -401,15 +401,16 @@ extern "C" {
        GGML_OP_ALIBI,
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
        GGML_OP_CONV_1D_STAGE_0,  // internal
        GGML_OP_CONV_1D_STAGE_1,  // internal
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_CONV_2D,
        GGML_OP_CONV_2D_STAGE_0,  // internal
        GGML_OP_CONV_2D_STAGE_1,  // internal
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

        GGML_OP_CONV_1D_STAGE_0,  // internal
        GGML_OP_CONV_1D_STAGE_1,  // internal

        GGML_OP_UPSCALE, // nearest interpolate

        GGML_OP_FLASH_ATTN,

@@ -708,7 +709,7 @@ extern "C" {
    // Context tensor enumeration and lookup
    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
    GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);

@@ -1020,9 +1021,9 @@ extern "C" {
            struct ggml_tensor  * b,
            float                 eps);

    // A: n columns, m rows
    // B: n columns, p rows (i.e. we transpose it internally)
    // result is m columns, p rows
    // A: k columns, n rows => [ne03, ne02, n, k]
    // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
    GGML_API struct ggml_tensor * ggml_mul_mat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,

@@ -1929,12 +1930,19 @@ extern "C" {
    // quantization
    //

    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

    //

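The updated ggml_mul_mat comment above documents the shape convention: a is [k, n], b is [k, m] (transposed internally), and the result is [n, m]. A rough usage sketch under that convention follows; this is editorial, not from the diff, and the buffer size and dimensions are arbitrary. It only builds the graph node to show the resulting shape, without computing it.

// Shape convention of ggml_mul_mat, assuming ggml.h from this tree.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { 16*1024*1024, nullptr, false };  // 16 MB arena (hypothetical)
    struct ggml_context * ctx = ggml_init(params);

    const int k = 64, n = 32, m = 8;
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n); // k columns, n rows
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, m); // k columns, m rows
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);                      // n columns, m rows

    printf("c: %lld x %lld\n", (long long) c->ne[0], (long long) c->ne[1]); // 32 x 8
    ggml_free(ctx);
    return 0;
}
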
19
llama.h
@@ -178,7 +178,7 @@ extern "C" {
        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool embedding;  // embedding mode only

@@ -191,6 +191,7 @@ extern "C" {
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
    } llama_model_quantize_params;

    // grammar types

@@ -333,17 +334,14 @@ extern "C" {
    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
            "avoid using this, it will be removed in the future, instead - count the tokens in user code");

    // Remove all tokens data of cells in [c0, c1)
    // c0 < 0 : [0, c1]
    // c1 < 0 : [c0, inf)
    LLAMA_API void llama_kv_cache_tokens_rm(
            struct llama_context * ctx,
                           int32_t   c0,
                           int32_t   c1);
    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
            struct llama_context * ctx);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
    // p0 < 0 : [0, p1]
    // p1 < 0 : [p0, inf)
    // seq_id < 0 : match any sequence
    // p0 < 0 : [0, p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,

@@ -658,6 +656,7 @@ extern "C" {
            float * mu);

    /// @details Selects the token with the highest probability.
    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
    LLAMA_API llama_token llama_sample_token_greedy(
            struct llama_context * ctx,
            llama_token_data_array * candidates);

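Callers migrating off the removed llama_kv_cache_tokens_rm() switch to llama_kv_cache_clear() for a full reset. A minimal caller-side sketch (editorial, not from the diff; ctx is assumed to be a valid context created elsewhere, and per-sequence trimming via llama_kv_cache_seq_rm() is omitted because its position arguments are cut off in the hunk above):

// Full KV-cache reset with the new API.
#include "llama.h"

void reset_cache(struct llama_context * ctx) {
    llama_kv_cache_clear(ctx);  // drop all cached tokens across every sequence
}
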
BIN
models/ggml-vocab-baichuan.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-gpt-neox.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-refact.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-starcoder.gguf
Normal file
Binary file not shown.
@@ -28,10 +28,14 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-1-llama.cpp)
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
llama_build_executable(test-tokenizer-1-bpe.cpp)
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW

@@ -4,7 +4,7 @@
#undef NDEBUG
#include <cassert>
#if !defined(__riscv) && !defined(__s390__)
#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
#include <immintrin.h>
#endif
#include <cmath>

@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
        ggml_type type = (ggml_type) i;
        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);

        // deprecated - skip
        if (qfns.blck_size == 0) {
            continue;
        }

        printf("Testing %s\n", ggml_type_name((ggml_type) i));

        if (qfns.from_float && qfns.to_float) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
            const float max_quantization_error =

@@ -91,9 +91,19 @@ int main(int argc, char **argv) {
            }
        }
    }
    // TODO: why doesn't this work for the full range of Unicodes?
    // Restrict to assigned unicode planes
    // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
    for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_bpe(ctx, tokens);
        if (str != check) {
            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
            return 4;
        }
    }
    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
        std::string str = codepoint_to_utf8(cp);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
        std::string check = llama_detokenize_bpe(ctx, tokens);

@@ -103,7 +113,6 @@ int main(int argc, char **argv) {
            return 4;
        }
    }

    llama_free_model(model);
    llama_free(ctx);
