mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
18 Commits
b2050
...
ik/ggml-qu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
91c453fb11 | ||
|
|
44bf949248 | ||
|
|
e6f8177532 | ||
|
|
30679d438d | ||
|
|
4be04c8965 | ||
|
|
5d55b0cd82 | ||
|
|
4833ac209d | ||
|
|
9392ebd49e | ||
|
|
5ed26e1fc9 | ||
|
|
277fad30c6 | ||
|
|
3c0d25c475 | ||
|
|
3cc5ed353c | ||
|
|
60ecf099ed | ||
|
|
e920ed393d | ||
|
|
52bb63c708 | ||
|
|
1ec3332ade | ||
|
|
6a66c5071a | ||
|
|
a305dba8ff |
@@ -13,18 +13,22 @@
|
||||
cudaPackages,
|
||||
darwin,
|
||||
rocmPackages,
|
||||
vulkan-headers,
|
||||
vulkan-loader,
|
||||
clblast,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
],
|
||||
useCuda ? config.cudaSupport,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||
useOpenCL ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
useVulkan ? false,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
}@inputs:
|
||||
|
||||
@@ -48,7 +52,8 @@ let
|
||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||
++ lib.optionals useMpi [ "MPI" ]
|
||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||
++ lib.optionals useRocm [ "ROCm" ];
|
||||
++ lib.optionals useRocm [ "ROCm" ]
|
||||
++ lib.optionals useVulkan [ "Vulkan" ];
|
||||
|
||||
pnameSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
@@ -108,6 +113,11 @@ let
|
||||
hipblas
|
||||
rocblas
|
||||
];
|
||||
|
||||
vulkanBuildInputs = [
|
||||
vulkan-headers
|
||||
vulkan-loader
|
||||
];
|
||||
in
|
||||
|
||||
effectiveStdenv.mkDerivation (
|
||||
@@ -164,7 +174,8 @@ effectiveStdenv.mkDerivation (
|
||||
++ optionals useCuda cudaBuildInputs
|
||||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useOpenCL [ clblast ]
|
||||
++ optionals useRocm rocmBuildInputs;
|
||||
++ optionals useRocm rocmBuildInputs
|
||||
++ optionals useVulkan vulkanBuildInputs;
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
@@ -178,6 +189,7 @@ effectiveStdenv.mkDerivation (
|
||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
||||
(cmakeBool "LLAMA_MPI" useMpi)
|
||||
(cmakeBool "LLAMA_VULKAN" useVulkan)
|
||||
]
|
||||
++ optionals useCuda [
|
||||
(
|
||||
@@ -218,6 +230,7 @@ effectiveStdenv.mkDerivation (
|
||||
useMpi
|
||||
useOpenCL
|
||||
useRocm
|
||||
useVulkan
|
||||
;
|
||||
|
||||
shell = mkShell {
|
||||
@@ -242,11 +255,11 @@ effectiveStdenv.mkDerivation (
|
||||
# Configurations we don't want even the CI to evaluate. Results in the
|
||||
# "unsupported platform" messages. This is mostly a no-op, because
|
||||
# cudaPackages would've refused to evaluate anyway.
|
||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||
badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
|
||||
|
||||
# Configurations that are known to result in build failures. Can be
|
||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||
broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
|
||||
|
||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||
|
||||
@@ -79,7 +79,7 @@ if (NOT MSVC)
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
|
||||
set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
|
||||
endif()
|
||||
|
||||
# 3rd party libs
|
||||
@@ -100,6 +100,10 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
|
||||
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
|
||||
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
|
||||
option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
|
||||
option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
@@ -431,6 +435,22 @@ if (LLAMA_VULKAN)
|
||||
|
||||
add_compile_definitions(GGML_USE_VULKAN)
|
||||
|
||||
if (LLAMA_VULKAN_CHECK_RESULTS)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_CHECK_RESULTS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_DEBUG)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_DEBUG)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_VALIDATE)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_VALIDATE)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_RUN_TESTS)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_RUN_TESTS)
|
||||
endif()
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
|
||||
22
Makefile
22
Makefile
@@ -109,6 +109,7 @@ MK_NVCCFLAGS += -O3
|
||||
else
|
||||
MK_CFLAGS += -O3
|
||||
MK_CXXFLAGS += -O3
|
||||
MK_NVCCFLAGS += -O3
|
||||
endif
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
@@ -365,7 +366,7 @@ ifdef LLAMA_CUBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
|
||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
|
||||
OBJS += ggml-cuda.o
|
||||
MK_NVCCFLAGS = -use_fast_math
|
||||
MK_NVCCFLAGS += -use_fast_math
|
||||
ifndef JETSON_EOL_MODULE_DETECT
|
||||
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
||||
endif # JETSON_EOL_MODULE_DETECT
|
||||
@@ -457,6 +458,18 @@ ifdef LLAMA_VULKAN_CHECK_RESULTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||
endif
|
||||
|
||||
ifdef LLAMA_VULKAN_DEBUG
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
|
||||
endif
|
||||
|
||||
ifdef LLAMA_VULKAN_VALIDATE
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
|
||||
endif
|
||||
|
||||
ifdef LLAMA_VULKAN_RUN_TESTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
|
||||
endif
|
||||
|
||||
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_VULKAN
|
||||
@@ -540,8 +553,11 @@ $(info I CFLAGS: $(CFLAGS))
|
||||
$(info I CXXFLAGS: $(CXXFLAGS))
|
||||
$(info I NVCCFLAGS: $(NVCCFLAGS))
|
||||
$(info I LDFLAGS: $(LDFLAGS))
|
||||
$(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
$(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
ifdef LLAMA_CUBLAS
|
||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||
endif # LLAMA_CUBLAS
|
||||
$(info )
|
||||
|
||||
#
|
||||
|
||||
@@ -107,6 +107,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
@@ -143,6 +144,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
- [pythops/tenere](https://github.com/pythops/tenere)
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -399,6 +399,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
sparams.penalty_present = std::stof(argv[i]);
|
||||
} else if (arg == "--dynatemp-range") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.dynatemp_range = std::stof(argv[i]);
|
||||
} else if (arg == "--dynatemp-exp") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.dynatemp_exponent = std::stof(argv[i]);
|
||||
} else if (arg == "--mirostat") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -515,7 +527,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
@@ -527,7 +539,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
@@ -664,7 +676,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
params.antiprompt.emplace_back(argv[i]);
|
||||
} else if (arg == "-ld" || arg == "--logdir") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -880,7 +892,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.emplace_back();
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
@@ -942,6 +954,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
|
||||
printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
|
||||
printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
|
||||
printf(" --mirostat N use Mirostat sampling.\n");
|
||||
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
|
||||
|
||||
@@ -75,8 +75,7 @@ struct gpt_params {
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
||||
// pinging @cebtenzzre
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
@@ -36,6 +36,8 @@ public:
|
||||
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
||||
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
void save_imatrix() const;
|
||||
bool load_imatrix(const char * file_name, bool add);
|
||||
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
|
||||
private:
|
||||
std::unordered_map<std::string, Stats> m_stats;
|
||||
StatParams m_params;
|
||||
@@ -189,6 +191,57 @@ void IMatrixCollector::save_imatrix(const char * fname) const {
|
||||
}
|
||||
}
|
||||
|
||||
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
|
||||
std::ifstream in(imatrix_file, std::ios::binary);
|
||||
if (!in) {
|
||||
printf("%s: failed to open %s\n",__func__,imatrix_file);
|
||||
return false;
|
||||
}
|
||||
int n_entries;
|
||||
in.read((char*)&n_entries, sizeof(n_entries));
|
||||
if (in.fail() || n_entries < 1) {
|
||||
printf("%s: no data in file %s\n", __func__, imatrix_file);
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < n_entries; ++i) {
|
||||
int len; in.read((char *)&len, sizeof(len));
|
||||
std::vector<char> name_as_vec(len+1);
|
||||
in.read((char *)name_as_vec.data(), len);
|
||||
if (in.fail()) {
|
||||
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
|
||||
return false;
|
||||
}
|
||||
name_as_vec[len] = 0;
|
||||
std::string name{name_as_vec.data()};
|
||||
auto& e = imatrix_data[std::move(name)];
|
||||
int ncall;
|
||||
in.read((char*)&ncall, sizeof(ncall));
|
||||
int nval;
|
||||
in.read((char *)&nval, sizeof(nval));
|
||||
if (in.fail() || nval < 1) {
|
||||
printf("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||
imatrix_data = {};
|
||||
return false;
|
||||
}
|
||||
e.values.resize(nval);
|
||||
in.read((char*)e.values.data(), nval*sizeof(float));
|
||||
if (in.fail()) {
|
||||
printf("%s: failed reading data for entry %d\n",__func__,i);
|
||||
imatrix_data = {};
|
||||
return false;
|
||||
}
|
||||
e.ncall = ncall;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
|
||||
if (!add) {
|
||||
m_stats.clear();
|
||||
}
|
||||
return load_imatrix(file_name, m_stats);
|
||||
}
|
||||
|
||||
static IMatrixCollector g_collector;
|
||||
|
||||
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
@@ -269,7 +322,7 @@ static void process_logits(
|
||||
}
|
||||
}
|
||||
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
@@ -282,6 +335,15 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (from_chunk > 0) {
|
||||
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
|
||||
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
|
||||
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
|
||||
}
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
@@ -402,7 +464,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
StatParams sparams;
|
||||
std::string prev_result_file;
|
||||
std::string combine_files;
|
||||
bool compute_ppl = true;
|
||||
int from_chunk = 0;
|
||||
std::vector<char*> args;
|
||||
args.push_back(argv[0]);
|
||||
int iarg = 1;
|
||||
@@ -423,6 +488,13 @@ int main(int argc, char ** argv) {
|
||||
compute_ppl = false;
|
||||
} else if (arg == "--keep-imatrix") {
|
||||
sparams.keep_every = std::stoi(argv[++iarg]);
|
||||
} else if (arg == "--continue-from") {
|
||||
prev_result_file = argv[++iarg];
|
||||
} else if (arg == "--combine") {
|
||||
combine_files = argv[++iarg];
|
||||
}
|
||||
else if (arg == "--from-chunk") {
|
||||
from_chunk = std::stoi(argv[++iarg]);
|
||||
} else {
|
||||
args.push_back(argv[iarg]);
|
||||
}
|
||||
@@ -436,14 +508,50 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
g_collector.set_parameters(std::move(sparams));
|
||||
|
||||
if (!combine_files.empty()) {
|
||||
std::vector<std::string> files;
|
||||
size_t pos = 0;
|
||||
while (true) {
|
||||
auto new_pos = combine_files.find(',', pos);
|
||||
if (new_pos != std::string::npos) {
|
||||
files.emplace_back(combine_files.substr(pos, new_pos - pos));
|
||||
pos = new_pos + 1;
|
||||
} else {
|
||||
files.emplace_back(combine_files.substr(pos));
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (files.size() < 2) {
|
||||
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
|
||||
return 1;
|
||||
}
|
||||
printf("Combining the following %d files\n", int(files.size()));
|
||||
for (auto& file : files) {
|
||||
printf(" %s\n", file.c_str());
|
||||
if (!g_collector.load_imatrix(file.c_str(), true)) {
|
||||
fprintf(stderr, "Failed to load %s\n", file.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
g_collector.save_imatrix();
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!prev_result_file.empty()) {
|
||||
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
|
||||
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
gpt_params params;
|
||||
params.n_batch = 512;
|
||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_collector.set_parameters(std::move(sparams));
|
||||
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
@@ -495,7 +603,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
bool OK = compute_imatrix(ctx, params, compute_ppl);
|
||||
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
||||
if (!OK) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -948,46 +948,46 @@ struct markdown_printer : public printer {
|
||||
|
||||
void print_header(const cmd_params & params) override {
|
||||
// select fields to print
|
||||
fields.push_back("model");
|
||||
fields.push_back("size");
|
||||
fields.push_back("params");
|
||||
fields.push_back("backend");
|
||||
fields.emplace_back("model");
|
||||
fields.emplace_back("size");
|
||||
fields.emplace_back("params");
|
||||
fields.emplace_back("backend");
|
||||
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
||||
if (!is_cpu_backend) {
|
||||
fields.push_back("n_gpu_layers");
|
||||
fields.emplace_back("n_gpu_layers");
|
||||
}
|
||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||
fields.push_back("n_threads");
|
||||
fields.emplace_back("n_threads");
|
||||
}
|
||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||
fields.push_back("n_batch");
|
||||
fields.emplace_back("n_batch");
|
||||
}
|
||||
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
|
||||
fields.push_back("type_k");
|
||||
fields.emplace_back("type_k");
|
||||
}
|
||||
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
||||
fields.push_back("type_v");
|
||||
fields.emplace_back("type_v");
|
||||
}
|
||||
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
||||
fields.push_back("main_gpu");
|
||||
fields.emplace_back("main_gpu");
|
||||
}
|
||||
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
|
||||
fields.push_back("split_mode");
|
||||
fields.emplace_back("split_mode");
|
||||
}
|
||||
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
|
||||
fields.push_back("mul_mat_q");
|
||||
fields.emplace_back("mul_mat_q");
|
||||
}
|
||||
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
|
||||
fields.push_back("no_kv_offload");
|
||||
fields.emplace_back("no_kv_offload");
|
||||
}
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.push_back("tensor_split");
|
||||
fields.emplace_back("tensor_split");
|
||||
}
|
||||
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||
fields.push_back("use_mmap");
|
||||
fields.emplace_back("use_mmap");
|
||||
}
|
||||
fields.push_back("test");
|
||||
fields.push_back("t/s");
|
||||
fields.emplace_back("test");
|
||||
fields.emplace_back("t/s");
|
||||
|
||||
fprintf(fout, "|");
|
||||
for (const auto & field : fields) {
|
||||
|
||||
@@ -352,12 +352,12 @@ int main(int argc, char ** argv) {
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
params.antiprompt.emplace_back("### Instruction:\n\n");
|
||||
}
|
||||
// similar for chatml mode
|
||||
else if (params.chatml) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.push_back("<|im_start|>user\n");
|
||||
params.antiprompt.emplace_back("<|im_start|>user\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
|
||||
@@ -881,7 +881,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
size_t li = hs_cur.common_prefix;
|
||||
for (int s = 0; s < 4; ++s) {
|
||||
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
|
||||
eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]));
|
||||
eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
|
||||
}
|
||||
++li;
|
||||
}
|
||||
@@ -1159,13 +1159,13 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
|
||||
size_t li = n_base1 - 1;
|
||||
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
|
||||
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1]));
|
||||
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
|
||||
}
|
||||
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
|
||||
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
|
||||
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
|
||||
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
|
||||
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1]));
|
||||
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
|
||||
}
|
||||
}
|
||||
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
|
||||
@@ -1524,7 +1524,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
size_t li = cur_task.common_prefix;
|
||||
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
|
||||
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
|
||||
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
|
||||
eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
|
||||
}
|
||||
++li;
|
||||
}
|
||||
|
||||
@@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.include_layers.push_back(argv[i]);
|
||||
params.include_layers.emplace_back(argv[i]);
|
||||
} else if (arg == "-L" || arg == "--exclude-layer") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.exclude_layers.push_back(argv[i]);
|
||||
params.exclude_layers.emplace_back(argv[i]);
|
||||
} else if (arg == "-t" || arg == "--type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
||||
@@ -208,13 +208,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
included_weights.push_back(argv[++arg_idx]);
|
||||
included_weights.emplace_back(argv[++arg_idx]);
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
excluded_weights.push_back(argv[++arg_idx]);
|
||||
excluded_weights.emplace_back(argv[++arg_idx]);
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
@@ -1884,7 +1884,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.api_keys.push_back(argv[i]);
|
||||
sparams.api_keys.emplace_back(argv[i]);
|
||||
}
|
||||
else if (arg == "--api-key-file")
|
||||
{
|
||||
@@ -2160,7 +2160,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-scaled")
|
||||
@@ -2176,7 +2176,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
@@ -2318,7 +2318,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
}
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.emplace_back();
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
|
||||
18
flake.lock
generated
18
flake.lock
generated
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1704982712,
|
||||
"narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
|
||||
"lastModified": 1706830856,
|
||||
"narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "07f6395285469419cf9d078f59b5b49993198c00",
|
||||
"rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1706191920,
|
||||
"narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
|
||||
"lastModified": 1706732774,
|
||||
"narHash": "sha256-hqJlyJk4MRpcItGYMF+3uHe8HvxNETWvlGtLuVpqLU0=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
|
||||
"rev": "b8b232ae7b8b144397fdb12d20f592e5e7c1a64d",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -37,11 +37,11 @@
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"dir": "lib",
|
||||
"lastModified": 1703961334,
|
||||
"narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
|
||||
"lastModified": 1706550542,
|
||||
"narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
|
||||
"rev": "97b17f32362e475016f942bbdfda4a4a72a8a652",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -157,6 +157,7 @@
|
||||
|
||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||
mpi-cuda = config.packages.default.override { useMpi = true; };
|
||||
vulkan = config.packages.default.override { useVulkan = true; };
|
||||
}
|
||||
// lib.optionalAttrs (system == "x86_64-linux") {
|
||||
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
||||
|
||||
@@ -19,6 +19,7 @@ extern "C" {
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||
#ifndef __cplusplus
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
@@ -26,6 +27,7 @@ extern "C" {
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
|
||||
117
ggml-quants.h
117
ggml-quants.h
@@ -191,70 +191,74 @@ typedef struct {
|
||||
} block_iq3_xxs;
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
||||
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
|
||||
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
|
||||
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
|
||||
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
|
||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
|
||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
@@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size);
|
||||
void iq2xs_free_impl(int grid_size);
|
||||
void iq3xs_init_impl(int grid_size);
|
||||
void iq3xs_free_impl(int grid_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
211
ggml-sycl.cpp
211
ggml-sycl.cpp
@@ -7693,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
const sycl::half *xi = (const sycl::half *)cxi;
|
||||
float *dsti = (float *)cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
|
||||
const int16_t *xi = (const int16_t *)cxi;
|
||||
int16_t *dsti = (int16_t *)cdsti;
|
||||
@@ -7709,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2);
|
||||
|
||||
@@ -7721,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
|
||||
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
@@ -7823,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
||||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) {
|
||||
const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
||||
item_ct1.get_local_id(2)) *
|
||||
qk;
|
||||
@@ -7834,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
return;
|
||||
}
|
||||
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
@@ -8247,7 +8258,8 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
|
||||
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
||||
}
|
||||
|
||||
static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
|
||||
template <typename T>
|
||||
static void im2col_kernel(const float *x, T *dst, int offset_delta,
|
||||
int IW, int IH, int OW, int KW, int KH,
|
||||
int pelements, int CHW, int s0, int s1, int p0,
|
||||
int p1, int d0, int d1,
|
||||
@@ -10598,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
|
||||
|
||||
static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
@@ -10614,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||
cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -10623,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
|
||||
|
||||
static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
@@ -10639,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||
cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -10648,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
|
||||
static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
@@ -10660,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
|
||||
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
||||
ne10, ne11, nb10, nb11, nb12, item_ct1);
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
@@ -10679,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
|
||||
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
||||
ne10, ne11, nb10, nb11, nb12, item_ct1);
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
@@ -10698,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, 1)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
|
||||
cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
|
||||
ne10, ne11, nb10, nb11, nb12, item_ct1);
|
||||
cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
@@ -10721,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||
cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -10730,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
|
||||
|
||||
static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
@@ -10746,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||
cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -10755,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
|
||||
|
||||
static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
||||
const int ne00, const int ne01,
|
||||
const int nb00, const int nb01,
|
||||
const int nb02, const int ne10,
|
||||
const int ne11, const int nb10,
|
||||
const int nb11, const int nb12,
|
||||
const int ne02, const int nb00,
|
||||
const int nb01, const int nb02,
|
||||
const int nb03, const int ne10,
|
||||
const int ne11, const int ne12,
|
||||
const int nb10, const int nb11,
|
||||
const int nb12, const int nb13,
|
||||
dpct::queue_ptr stream) {
|
||||
|
||||
const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
|
||||
@@ -10771,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
|
||||
nb02, ne10, ne11, nb10, nb11, nb12,
|
||||
cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
|
||||
nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
|
||||
item_ct1);
|
||||
});
|
||||
}
|
||||
@@ -11019,7 +11050,8 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
||||
});
|
||||
}
|
||||
|
||||
static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
|
||||
template <typename T>
|
||||
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
||||
int OW, int OH, int KW, int KH, int IC,
|
||||
int offset_delta, int s0, int s1, int p0,
|
||||
int p1, int d0, int d1,
|
||||
@@ -11036,7 +11068,7 @@ static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
|
||||
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
|
||||
im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
|
||||
parallel_elements, (IC * KH * KW), s0, s1, p0,
|
||||
p1, d0, d1, item_ct1);
|
||||
});
|
||||
@@ -12424,7 +12456,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
@@ -12447,8 +12479,11 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
||||
|
||||
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
||||
|
||||
im2col_f32_f16_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
|
||||
IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
} else {
|
||||
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
}
|
||||
|
||||
(void) src0;
|
||||
(void) src0_dd;
|
||||
@@ -13905,19 +13940,23 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
GGML_ASSERT(src0->ne[3] == 1);
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
|
||||
const int64_t nb00 = src0->nb[0];
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
const int64_t nb03 = src0->nb[3];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
GGML_ASSERT(src1->ne[3] == 1);
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
|
||||
const int64_t nb10 = src1->nb[0];
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2];
|
||||
const int64_t nb13 = src1->nb[3];
|
||||
|
||||
SYCL_CHECK(ggml_sycl_set_device(g_main_device));
|
||||
dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
|
||||
@@ -13929,21 +13968,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
|
||||
char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
|
||||
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
|
||||
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
|
||||
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
|
||||
ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
|
||||
10922
ggml-vulkan-shaders.hpp
10922
ggml-vulkan-shaders.hpp
File diff suppressed because it is too large
Load Diff
420
ggml-vulkan.cpp
420
ggml-vulkan.cpp
File diff suppressed because it is too large
Load Diff
@@ -157,19 +157,10 @@ struct block_q6_K
|
||||
|
||||
# Dequant functions
|
||||
shader_f16_dequant_func = """
|
||||
#define DEQUANT_FUNC f16vec2 v = f16vec2(data_a[ib + 0], data_a[ib + 1]);
|
||||
"""
|
||||
shader_f16_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
|
||||
"""
|
||||
|
||||
shader_q4_0_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
|
||||
v = (v - 8.0hf)*d;
|
||||
"""
|
||||
shader_q4_0_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const uint vui = uint(data_a[ib].qs[iqs]); \
|
||||
vec2 v = vec2(vui & 0xF, vui >> 4); \
|
||||
@@ -177,13 +168,6 @@ v = (v - 8.0f)*d;
|
||||
"""
|
||||
|
||||
shader_q4_1_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const float16_t m = data_a[ib].m; \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
|
||||
v = v*d + m;
|
||||
"""
|
||||
shader_q4_1_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const float m = float(data_a[ib].m); \
|
||||
const uint vui = uint(data_a[ib].qs[iqs]); \
|
||||
@@ -192,14 +176,6 @@ v = v*d + m;
|
||||
"""
|
||||
|
||||
shader_q5_0_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
|
||||
v = (v - 16.0hf) * d;
|
||||
"""
|
||||
shader_q5_0_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
||||
@@ -209,14 +185,6 @@ v = (v - 16.0f) * d;
|
||||
"""
|
||||
|
||||
shader_q5_1_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const float16_t m = data_a[ib].m; \
|
||||
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
|
||||
v = v*d + m;
|
||||
"""
|
||||
shader_q5_1_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const float m = float(data_a[ib].m); \
|
||||
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
||||
@@ -226,11 +194,6 @@ v = v*d + m;
|
||||
"""
|
||||
|
||||
shader_q8_0_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
f16vec2 v = f16vec2(data_a[ib].qs[iqs], data_a[ib].qs[iqs + 1]); \
|
||||
v = v * d;
|
||||
"""
|
||||
shader_q8_0_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \
|
||||
v = v * d;
|
||||
@@ -2110,7 +2073,7 @@ lock = asyncio.Lock()
|
||||
shader_fnames = []
|
||||
|
||||
|
||||
async def string_to_spv(name, code, defines, fp16):
|
||||
async def string_to_spv(name, code, defines, fp16=True):
|
||||
f = NamedTemporaryFile(mode="w", delete=False)
|
||||
f.write(code)
|
||||
f.flush()
|
||||
@@ -2200,64 +2163,6 @@ async def main():
|
||||
tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
|
||||
# Build dequant shaders
|
||||
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}, fp16))
|
||||
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
|
||||
stream.extend((dequant_head, shader_int8_ext, shader_float_type))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q2_K:
|
||||
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
|
||||
elif i == GGML_TYPE_Q3_K:
|
||||
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
|
||||
elif i == GGML_TYPE_Q4_K:
|
||||
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
|
||||
elif i == GGML_TYPE_Q5_K:
|
||||
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
|
||||
elif i == GGML_TYPE_Q6_K:
|
||||
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}, fp16))
|
||||
|
||||
# get_rows
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
stream.extend((generic_head, shader_int8_ext, shader_float_type))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, get_rows_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}, fp16))
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||
|
||||
# Shaders where precision is needed, so no fp16 version
|
||||
|
||||
# mul mat vec
|
||||
@@ -2266,17 +2171,17 @@ async def main():
|
||||
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat, mul_mat_vec_body))
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat, mul_mat_vec_body))
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat, mul_mat_vec_body))
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat, mul_mat_vec_body))
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat, mul_mat_vec_body))
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat, mul_mat_vec_body))
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q2_K:
|
||||
stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
|
||||
elif i == GGML_TYPE_Q3_K:
|
||||
@@ -2290,43 +2195,101 @@ async def main():
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}, fp16))
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
||||
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
# Dequant shaders
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
|
||||
stream.extend((dequant_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q2_K:
|
||||
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
|
||||
elif i == GGML_TYPE_Q3_K:
|
||||
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
|
||||
elif i == GGML_TYPE_Q4_K:
|
||||
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
|
||||
elif i == GGML_TYPE_Q5_K:
|
||||
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
|
||||
elif i == GGML_TYPE_Q6_K:
|
||||
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}))
|
||||
|
||||
# get_rows
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
stream.extend((generic_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Norms
|
||||
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}, True))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
||||
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}, True))
|
||||
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
|
||||
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
||||
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
|
||||
24
llama.cpp
24
llama.cpp
@@ -208,7 +208,7 @@ enum llm_arch {
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GPT2, "gpt2" },
|
||||
@@ -285,7 +285,7 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
};
|
||||
|
||||
static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
||||
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
@@ -346,7 +346,7 @@ struct LLM_KV {
|
||||
llm_arch arch;
|
||||
|
||||
std::string operator()(llm_kv kv) const {
|
||||
return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
|
||||
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -747,13 +747,13 @@ struct LLM_TN {
|
||||
// gguf helpers
|
||||
//
|
||||
|
||||
static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
|
||||
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
||||
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
||||
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
||||
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
||||
};
|
||||
|
||||
static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
||||
if (kv.second == name) {
|
||||
return kv.first;
|
||||
@@ -1415,6 +1415,7 @@ static const size_t GiB = 1024*MiB;
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool rope_finetuned;
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
@@ -1434,8 +1435,7 @@ struct llama_hparams {
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_yarn_orig_ctx;
|
||||
int8_t rope_scaling_type_train : 3;
|
||||
bool rope_finetuned : 1;
|
||||
int32_t rope_scaling_type_train;
|
||||
|
||||
float f_clamp_kqv;
|
||||
float f_max_alibi_bias;
|
||||
@@ -2701,7 +2701,7 @@ struct llama_model_loader {
|
||||
// load LLaMA models
|
||||
//
|
||||
|
||||
static std::string llama_model_arch_name(llm_arch arch) {
|
||||
static const char * llama_model_arch_name(llm_arch arch) {
|
||||
auto it = LLM_ARCH_NAMES.find(arch);
|
||||
if (it == LLM_ARCH_NAMES.end()) {
|
||||
return "unknown";
|
||||
@@ -3310,11 +3310,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & vocab = model.vocab;
|
||||
|
||||
const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||
|
||||
// hparams
|
||||
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
||||
@@ -3336,7 +3336,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
||||
@@ -10735,7 +10735,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
|
||||
|
||||
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||
return snprintf(buf, buf_size, "%s %s %s",
|
||||
llama_model_arch_name(model->arch).c_str(),
|
||||
llama_model_arch_name(model->arch),
|
||||
llama_model_type_name(model->type),
|
||||
llama_model_ftype_name(model->ftype).c_str());
|
||||
}
|
||||
|
||||
2
llama.h
2
llama.h
@@ -213,7 +213,7 @@ extern "C" {
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
|
||||
@@ -14,16 +14,17 @@
|
||||
# - Might be unstable!
|
||||
#
|
||||
# Usage:
|
||||
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
|
||||
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]
|
||||
#
|
||||
# --port: port number, default is 8888
|
||||
# --repo: path to a repo containing GGUF model files
|
||||
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
|
||||
# --backend: cpu, cuda, metal, opencl, depends on the OS
|
||||
# --gpu-id: gpu id, default is 0
|
||||
# --n-parallel: number of parallel requests, default is 8
|
||||
# --n-kv: KV cache size, default is 4096
|
||||
# --verbose: verbose output
|
||||
# --port: port number, default is 8888
|
||||
# --repo: path to a repo containing GGUF model files
|
||||
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
|
||||
# --backend: cpu, cuda, metal, opencl, depends on the OS
|
||||
# --gpu-id: gpu id, default is 0
|
||||
# --n-parallel: number of parallel requests, default is 8
|
||||
# --n-kv: KV cache size, default is 4096
|
||||
# --verbose: verbose output
|
||||
# --non-interactive: run without asking a permission to run
|
||||
#
|
||||
# Example:
|
||||
#
|
||||
@@ -47,6 +48,7 @@ if ! command -v make &> /dev/null; then
|
||||
fi
|
||||
|
||||
# parse arguments
|
||||
is_interactive=1
|
||||
port=8888
|
||||
repo=""
|
||||
wtype=""
|
||||
@@ -66,15 +68,16 @@ verbose=0
|
||||
|
||||
function print_usage {
|
||||
printf "Usage:\n"
|
||||
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
|
||||
printf " --port: port number, default is 8888\n"
|
||||
printf " --repo: path to a repo containing GGUF model files\n"
|
||||
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
|
||||
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
|
||||
printf " --gpu-id: gpu id, default is 0\n"
|
||||
printf " --n-parallel: number of parallel requests, default is 8\n"
|
||||
printf " --n-kv: KV cache size, default is 4096\n"
|
||||
printf " --verbose: verbose output\n\n"
|
||||
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n"
|
||||
printf " --port: port number, default is 8888\n"
|
||||
printf " --repo: path to a repo containing GGUF model files\n"
|
||||
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
|
||||
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
|
||||
printf " --gpu-id: gpu id, default is 0\n"
|
||||
printf " --n-parallel: number of parallel requests, default is 8\n"
|
||||
printf " --n-kv: KV cache size, default is 4096\n"
|
||||
printf " --verbose: verbose output\n\n"
|
||||
printf " --non-interactive: run without asking a permission to run\n"
|
||||
printf "Example:\n\n"
|
||||
printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
|
||||
}
|
||||
@@ -82,6 +85,10 @@ function print_usage {
|
||||
while [[ $# -gt 0 ]]; do
|
||||
key="$1"
|
||||
case $key in
|
||||
--non-interactive)
|
||||
is_interactive=0
|
||||
shift
|
||||
;;
|
||||
--port)
|
||||
port="$2"
|
||||
shift
|
||||
@@ -176,31 +183,32 @@ repos=(
|
||||
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
|
||||
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
|
||||
)
|
||||
if [ $is_interactive -eq 1 ]; then
|
||||
printf "\n"
|
||||
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
|
||||
printf " Based on the options that follow, the script might download a model file\n"
|
||||
printf " from the internet, which can be a few GBs in size. The script will also\n"
|
||||
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
|
||||
printf "\n"
|
||||
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
|
||||
printf " model using llama.cpp for demonstration purposes.\n"
|
||||
printf "\n"
|
||||
printf " Please note:\n"
|
||||
printf "\n"
|
||||
printf " - All new data will be stored in the current folder\n"
|
||||
printf " - The server will be listening on all network interfaces\n"
|
||||
printf " - The server will run with default settings which are not always optimal\n"
|
||||
printf " - Do not judge the quality of a model based on the results from this script\n"
|
||||
printf " - Do not use this script to benchmark llama.cpp\n"
|
||||
printf " - Do not use this script in production\n"
|
||||
printf " - This script is only for demonstration purposes\n"
|
||||
printf "\n"
|
||||
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
|
||||
printf "\n"
|
||||
printf " Press Enter to continue ...\n\n"
|
||||
|
||||
printf "\n"
|
||||
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
|
||||
printf " Based on the options that follow, the script might download a model file\n"
|
||||
printf " from the internet, which can be a few GBs in size. The script will also\n"
|
||||
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
|
||||
printf "\n"
|
||||
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
|
||||
printf " model using llama.cpp for demonstration purposes.\n"
|
||||
printf "\n"
|
||||
printf " Please note:\n"
|
||||
printf "\n"
|
||||
printf " - All new data will be stored in the current folder\n"
|
||||
printf " - The server will be listening on all network interfaces\n"
|
||||
printf " - The server will run with default settings which are not always optimal\n"
|
||||
printf " - Do not judge the quality of a model based on the results from this script\n"
|
||||
printf " - Do not use this script to benchmark llama.cpp\n"
|
||||
printf " - Do not use this script in production\n"
|
||||
printf " - This script is only for demonstration purposes\n"
|
||||
printf "\n"
|
||||
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
|
||||
printf "\n"
|
||||
printf " Press Enter to continue ...\n\n"
|
||||
|
||||
read
|
||||
read
|
||||
fi
|
||||
|
||||
if [[ -z "$repo" ]]; then
|
||||
printf "[+] No repo provided from the command line\n"
|
||||
|
||||
@@ -105,7 +105,7 @@ int main()
|
||||
|
||||
for (auto rule : expected_rules)
|
||||
{
|
||||
parsed_grammar.rules.push_back({});
|
||||
parsed_grammar.rules.emplace_back();
|
||||
for (auto element : rule)
|
||||
{
|
||||
parsed_grammar.rules.back().push_back(element);
|
||||
|
||||
Reference in New Issue
Block a user