Compare commits

..

21 Commits

Author SHA1 Message Date
Georgi Gerganov
0492363137 mpi : fix after master merge 2023-07-09 22:23:04 +03:00
Georgi Gerganov
81c5ddd532 Merge branch 'mpi' into refactor-mpi 2023-07-09 22:20:14 +03:00
Evan Miller
03cc12be0d [mpi] continue-on-error: true 2023-07-09 15:10:43 -04:00
Evan Miller
4a9a4748e9 Add OpenMPI to GH action 2023-07-09 15:05:58 -04:00
Evan Miller
0f557c2ac4 Merge branch 'master' into mpi 2023-07-09 15:02:19 -04:00
Georgi Gerganov
9da9d26c70 mpi : minor 2023-07-09 18:38:32 +03:00
Georgi Gerganov
beadbf3380 mpi : fix inference 2023-07-09 18:26:20 +03:00
Georgi Gerganov
ef37dd14e7 mpi : fix output tensor after MPI compute (still not working) 2023-07-09 17:01:08 +03:00
Georgi Gerganov
c717c5185f mpi : various fixes - communication now works but results are wrong 2023-07-09 16:40:16 +03:00
Georgi Gerganov
01abb3b3b9 mpi : move all MPI logic into ggml-mpi
Not tested yet
2023-07-09 16:04:27 +03:00
Georgi Gerganov
e339d35579 mpi : add names for layer inputs + prep ggml_mpi_graph_compute() 2023-07-09 14:42:36 +03:00
Georgi Gerganov
3232db628c mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099) 2023-07-09 14:08:53 +03:00
Evan Miller
ef61acfbf5 Add info to README 2023-07-07 09:02:23 -04:00
Evan Miller
55207ba2b8 Add GH workflow, fix test 2023-07-06 21:40:18 -04:00
Evan Miller
1f0a2cfeda Update CMakeLists.txt 2023-07-06 21:25:34 -04:00
Evan Miller
06a239343c PR comments 2023-07-06 20:18:41 -04:00
Evan Miller
32deabfdc8 Merge branch 'master' into mpi 2023-07-06 19:04:50 -04:00
Evan Miller
042c5b278f wrap includes 2023-07-04 00:13:20 -04:00
Evan Miller
668ba5fe0b fixes 2023-07-04 00:09:02 -04:00
Evan Miller
d05ca74dd8 fix warnings, update README 2023-07-03 23:53:43 -04:00
Evan Miller
f85785f650 MPI support, first cut 2023-07-03 21:51:05 -04:00
25 changed files with 849 additions and 1940 deletions

View File

@@ -10,13 +10,13 @@ shift
# Join the remaining arguments into a single string
arg2="$@"
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$arg2"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$arg2"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$arg2"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
python3 ./convert.py $arg2
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
./quantize $arg2
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
./main $arg2
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
if [ -f "${i/f16/q4_0}" ]; then
@@ -26,8 +26,6 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
./quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$arg2"
else
echo "Unknown command: $arg1"
echo "Available commands: "
@@ -39,6 +37,4 @@ else
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server"
echo " ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
fi

1
.gitignore vendored
View File

@@ -20,7 +20,6 @@ build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

View File

@@ -272,7 +272,7 @@ if (LLAMA_CUBLAS)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
if (LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
endif()
@@ -321,11 +321,6 @@ if (LLAMA_MPI)
set(c_flags ${c_flags} -Wno-cast-qual)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()

View File

@@ -151,6 +151,9 @@ ifdef LLAMA_MPI
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
OBJS += ggml-mpi.o
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS
@@ -223,6 +226,9 @@ ifdef LLAMA_METAL
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
OBJS += ggml-metal.o
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL
ifneq ($(filter aarch64%,$(UNAME_M)),)
@@ -247,16 +253,6 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
ifdef LLAMA_METAL
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifdef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@

View File

@@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options.
- Using `Zig`:
```bash
zig build -Doptimize=ReleaseFast
zig build -Drelease-fast
```
### Metal Build

View File

@@ -1,19 +1,9 @@
const std = @import("std");
const commit_hash = @embedFile(".git/refs/heads/master");
// Zig Version: 0.11.0-dev.3986+e05c242cd
// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});
const config_header = b.addConfigHeader(
.{ .style = .blank, .include_path = "build-info.h" },
.{
.BUILD_NUMBER = 0,
.BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
},
);
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
@@ -23,21 +13,24 @@ pub fn build(b: *std.build.Builder) void {
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("./examples");
lib.addConfigHeader(config_header);
lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
b.installArtifact(lib);
const examples = .{
"main",
"baby-llama",
"embedding",
"metal",
// "metal",
"perplexity",
"quantize",
"quantize-stats",
"save-load-state",
"server",
// "server",
"simple",
"train-text-from-scratch",
};
@@ -50,19 +43,16 @@ pub fn build(b: *std.build.Builder) void {
});
exe.addIncludePath(".");
exe.addIncludePath("./examples");
exe.addConfigHeader(config_header);
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| run_cmd.addArgs(args);
const run_step = b.step("run-" ++ example_name, "Run the app");
const run_step = b.step("run_" ++ example_name, "Run the app");
run_step.dependOn(&run_cmd.step);
}
}

View File

@@ -236,24 +236,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.mirostat_tau = std::stof(argv[i]);
} else if (arg == "--cfg-negative-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cfg_negative_prompt = argv[i];
} else if (arg == "--cfg-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--cfg-smooth-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cfg_smooth_factor = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
@@ -488,10 +470,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
@@ -558,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return res;
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
@@ -574,12 +552,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
return lparams;
}
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());

View File

@@ -48,12 +48,6 @@ struct gpt_params {
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
std::string model = "models/7B/ggml-model.bin"; // model path
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
@@ -105,7 +99,6 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
//
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
//
// Console utils

View File

@@ -17,7 +17,7 @@ make
import torch
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd-input/llava_projection.pth"
pth_path = "./examples/embd_input/llava_projection.pth"
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]

View File

@@ -59,7 +59,7 @@ if __name__=="__main__":
# Also here can use pytorch_model-00003-of-00003.bin directly.
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"llava_projection.pth"))
"llava_projetion.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")

View File

@@ -109,16 +109,10 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (params.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
@@ -189,28 +183,15 @@ int main(int argc, char ** argv) {
// tokenize the prompt
std::vector<llama_token> embd_inp;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
embd_inp = session_tokens;
}
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
params.cfg_negative_prompt.insert(0, 1, ' ');
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
}
const int n_ctx = llama_n_ctx(ctx);
if ((int) embd_inp.size() > n_ctx - 4) {
@@ -277,16 +258,6 @@ int main(int argc, char ** argv) {
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
if (ctx_guidance) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
}
}
if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
@@ -363,13 +334,11 @@ int main(int argc, char ** argv) {
int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
int n_past_guidance = 0;
// the first thing we will do is to output the prompt, so set color accordingly
console_set_color(con_st, CONSOLE_COLOR_PROMPT);
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
// do one empty run to warm up the model
{
@@ -398,12 +367,11 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (n_past + (int) embd.size() > n_ctx) {
const int n_left = n_past - params.n_keep;
// always keep the first token - BOS
n_past = std::max(1, params.n_keep);
n_past_guidance = std::max(1, params.n_keep + guidance_offset);
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@@ -444,48 +412,6 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token* input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
//fprintf(stderr, "\n---------------------\n");
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
//}
//fprintf(stderr, "\n---------------------\n");
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
@@ -505,7 +431,6 @@ int main(int argc, char ** argv) {
}
embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
@@ -548,10 +473,6 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
}
// Apply penalties
float nl_logit = logits[llama_token_nl()];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
@@ -747,7 +668,6 @@ int main(int argc, char ** argv) {
}
llama_print_timings(ctx);
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);

View File

@@ -1354,9 +1354,17 @@ struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) {
}
}
for (int i = 0; i < GGML_MAX_SRC; ++i) {
if (t->src[i]) {
expand(g, t->src[i]);
if (t->src0) {
expand(g, t->src0);
}
if (t->src1) {
expand(g, t->src1);
}
for (int i = 0; i < GGML_MAX_OPT; ++i) {
if (t->opt[i]) {
expand(g, t->opt[i]);
}
}

View File

@@ -13,8 +13,6 @@
#include "ggml-cuda.h"
#include "ggml.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
@@ -76,7 +74,7 @@ typedef void (*ggml_cuda_op_t)(
#define QK4_0 32
#define QR4_0 2
#define QI4_0 (QK4_0 / (4 * QR4_0))
#define QI4_0 4
typedef struct {
half d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -85,7 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
#define QK4_1 32
#define QR4_1 2
#define QI4_1 (QK4_1 / (4 * QR4_1))
#define QI4_1 4
typedef struct {
half d; // delta
half m; // min
@@ -95,7 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
#define QK5_0 32
#define QR5_0 2
#define QI5_0 (QK5_0 / (4 * QR5_0))
#define QI5_0 4
typedef struct {
half d; // delta
uint8_t qh[4]; // 5-th bit of quants
@@ -105,7 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
#define QK5_1 32
#define QR5_1 2
#define QI5_1 (QK5_1 / (4 * QR5_1))
#define QI5_1 4
typedef struct {
half d; // delta
half m; // min
@@ -116,7 +114,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
#define QK8_0 32
#define QR8_0 1
#define QI8_0 (QK8_0 / (4 * QR8_0))
#define QI8_0 8
typedef struct {
half d; // delta
int8_t qs[QK8_0]; // quants
@@ -125,7 +123,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
#define QK8_1 32
#define QR8_1 1
#define QI8_1 (QK8_1 / (4 * QR8_1))
#define QI8_1 8
typedef struct {
half d; // delta
half s; // unquantized sum
@@ -145,8 +143,6 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_
#define K_SCALE_SIZE 12
#endif
#define QR2_K 4
#define QI2_K (QK_K / (4*QR2_K))
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
@@ -155,8 +151,6 @@ typedef struct {
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
#define QR3_K 4
#define QI3_K (QK_K / (4*QR3_K))
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
@@ -169,8 +163,6 @@ typedef struct {
} block_q3_K;
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
#define QR4_K 2
#define QI4_K (QK_K / (4*QR4_K))
#ifdef GGML_QKK_64
typedef struct {
half d[2]; // super-block scales/mins
@@ -188,8 +180,6 @@ typedef struct {
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
#endif
#define QR5_K 2
#define QI5_K (QK_K / (4*QR5_K))
#ifdef GGML_QKK_64
typedef struct {
half d; // super-block scale
@@ -209,8 +199,6 @@ typedef struct {
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif
#define QR6_K 2
#define QI6_K (QK_K / (4*QR6_K))
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -224,7 +212,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define CUDA_ADD_BLOCK_SIZE 256
#define CUDA_MUL_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256
@@ -252,13 +239,13 @@ struct ggml_tensor_extra_gpu {
cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
};
static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= kx) {
if (i >= k) {
return;
}
dst[i] = x[i] + y[i%ky];
dst[i] = x[i] + y[i];
}
static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) {
@@ -279,19 +266,6 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
dst[i] = x[i] * y[i%ky];
}
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
const float GELU_COEF_A = 0.044715f;
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
float xi = x[i];
dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
}
static __global__ void silu_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -301,46 +275,16 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i]));
}
static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const float eps = 1e-5f;
float mean = 0.0f;
float var = 0.0f;
for (int col = tid; col < ncols; col += WARP_SIZE) {
const float xi = x[row*ncols + col];
mean += xi;
var += xi * xi;
}
// sum up partial sums
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
mean += __shfl_xor_sync(0xffffffff, mean, mask, 32);
var += __shfl_xor_sync(0xffffffff, var, mask, 32);
}
mean /= ncols;
var = var / ncols - mean * mean;
const float inv_var = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_var;
}
}
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
const float eps = 1e-6f;
const float eps = 1e-6;
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += WARP_SIZE) {
for (int i = 0; i < ncols; i += WARP_SIZE) {
const int col = i + tid;
const float xi = x[row*ncols + col];
tmp += xi * xi;
}
@@ -352,9 +296,10 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
}
const float mean = tmp / ncols;
const float scale = rsqrtf(mean + eps);
const float scale = 1.0f / sqrtf(mean + eps);
for (int col = tid; col < ncols; col += WARP_SIZE) {
for (int i = 0; i < ncols; i += WARP_SIZE) {
const int col = i + tid;
dst[row*ncols + col] = scale * x[row*ncols + col];
}
}
@@ -1283,9 +1228,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __
y[iybs + iqs + y_offset] = v.y;
}
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
int vi;
@@ -1306,12 +1250,11 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= 600
}
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1332,12 +1275,11 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= 600
}
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
int qs;
@@ -1368,12 +1310,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= 600
}
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
@@ -1403,12 +1344,11 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= 600
}
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
int vi;
@@ -1423,220 +1363,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
return sumi*d;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q2_K * bq2_K = (const block_q2_K *) vbq;
const int bq8_offset = QR2_K * (iqs / QI8_1);
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
float sumf_d = 0.0f;
float sumf_m = 0.0f;
const float d = bq2_K->d;
const float dmin = bq2_K->dmin;
const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]);
for (int i = 0; i < QR2_K; ++i) {
const int sc = bq2_K->scales[scale_offset + 2*i];
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
const float d8i = bq8i->d;
const int vi = (v >> (2*i)) & 0x03030303;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values
}
return d*sumf_d - dmin*sumf_m;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q3_K * bq3_K = (const block_q3_K *) vbq;
const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
float sumf = 0.0f;
const float d = bq3_K->d;
int vl;
memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int));
int vh;
memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int));
vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted
vh >>= bq8_offset;
for (int i = 0; i < QR3_K; ++i) {
const int isc = scale_offset + 2*i;
const int isc_low = isc % (QK_K/32);
const int sc_shift_low = 4 * (isc / (QK_K/32));
const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF;
const int isc_high = isc % (QK_K/64);
const int sc_shift_high = 2 * (isc / (QK_K/64));
const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
const int sc = (sc_low | sc_high) - 32;
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
const float d8i = bq8i->d;
const int vil = (vl >> (2*i)) & 0x03030303;
const int vih = ((vh >> i) << 2) & 0x04040404;
const int vi = __vsubss4(vil, vih);
sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
}
return d*sumf;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
const int bq8_offset = QR4_K * (iqs / QI8_1);
float sumf_d = 0.0f;
float sumf_m = 0.0f;
const float d = bq4_K->d;
const float dmin = bq4_K->dmin;
const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
for (int i = 0; i < QR4_K; ++i) {
const int isc = bq8_offset + i;
uint8_t sc, m;
get_scale_min_k4(isc, bq4_K->scales, sc, m);
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
const float d8i = bq8i->d;
const int vi = (v >> (4*i)) & 0x0F0F0F0F;
sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
}
return d*sumf_d - dmin*sumf_m;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
const int bq8_offset = QR5_K * (iqs / QI8_1);
float sumf_d = 0.0f;
float sumf_m = 0.0f;
const float d = bq5_K->d;
const float dmin = bq5_K->dmin;
const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
for (int i = 0; i < QR5_K; ++i) {
const int isc = bq8_offset + i;
uint8_t sc, m;
get_scale_min_k4(isc, bq5_K->scales, sc, m);
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
const float d8i = bq8i->d;
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
const int vih = ((vh >> i) << 4) & 0x10101010;
const int vi = vil | vih;
sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
}
return d*sumf_d - dmin*sumf_m;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q6_K * bq6_K = (const block_q6_K *) vbq;
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
float sumf = 0.0f;
const float d = bq6_K->d;
int vl;
memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int));
int vh;
memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int));
for (int i = 0; i < QR6_K; ++i) {
const int sc = bq6_K->scales[scale_offset + 4*i];
const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]);
const float d8i = bq8i->d;
const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030;
const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
}
return d*sumf;
#else
return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // __CUDA_ARCH__ >= 600
}
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -1659,7 +1386,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx
const int iby = i + threadIdx.x / qi; // y block index
const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
@@ -1897,40 +1624,6 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
dst[i + 1] = x0*sin_theta + x1*cos_theta;
}
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4;
if (col >= half_n_dims) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const float col_theta_scale = powf(theta_scale, col);
const float theta = p*col_theta_scale;
const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta);
const float x0 = x[i + 0];
const float x1 = x[i + half_n_dims];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
const float block_theta = block_p*col_theta_scale;
const float sin_block_theta = sinf(block_theta);
const float cos_block_theta = cosf(block_theta);
const float x2 = x[i + half_n_dims * 2];
const float x3 = x[i + half_n_dims * 3];
dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int row = blockDim.y*blockIdx.y + threadIdx.y;
@@ -1996,9 +1689,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
dst[i] = scale * x[i];
}
static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) {
@@ -2011,22 +1704,11 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
const dim3 block_dims(WARP_SIZE, 1, 1);
norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
}
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
const dim3 block_dims(WARP_SIZE, 1, 1);
@@ -2192,7 +1874,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
}
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK4_0 == 0);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2201,7 +1883,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
}
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK4_1 == 0);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2210,7 +1892,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
}
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK5_0 == 0);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2219,7 +1901,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
}
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK5_1 == 0);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2228,7 +1910,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
}
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK8_0 == 0);
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -2236,51 +1918,6 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, vec_dot_q2_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, vec_dot_q3_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, vec_dot_q6_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2373,14 +2010,6 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
}
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(nrows % 4 == 0);
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(num_blocks_x, nrows, 1);
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
}
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
@@ -2608,19 +2237,16 @@ inline void ggml_cuda_op_add(
GGML_ASSERT(src0_ddq_i != nullptr || src0_ddf_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
const int64_t ne00 = src0->ne[0];
const int64_t ne0 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
// compute
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main);
add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne0*i01_diff, cudaStream_main);
} else {
GGML_ASSERT(false);
}
@@ -2639,43 +2265,29 @@ inline void ggml_cuda_op_mul(
GGML_ASSERT(src0_ddf_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low;
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main);
for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
// compute
mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
}
(void) dst;
(void) src0_ddq_i;
(void) i02;
}
inline void ggml_cuda_op_gelu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
cudaStream_t & cudaStream_main){
GGML_ASSERT(src0_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low;
// compute
gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
(void) src1;
(void) dst;
(void) src0_ddq_i;
(void) src1_ddf_i;
(void) i02;
(void) i1;
}
inline void ggml_cuda_op_silu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2698,28 +2310,6 @@ inline void ggml_cuda_op_silu(
(void) i1;
}
inline void ggml_cuda_op_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
cudaStream_t & cudaStream_main){
GGML_ASSERT(src0_ddf_i != nullptr);
GGML_ASSERT(dst_ddf_i != nullptr);
const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low;
// compute
norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
(void) src1;
(void) dst;
(void) src0_ddq_i;
(void) src1_ddf_i;
(void) i02;
(void) i1;
}
inline void ggml_cuda_op_rms_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -2760,22 +2350,13 @@ inline void ggml_cuda_op_mul_mat_vec(
int id;
CUDA_CHECK(cudaGetDevice(&id));
bool mul_mat_vec_q_implemented =
src0->type == GGML_TYPE_Q4_0 ||
const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
src0->type == GGML_TYPE_Q4_1 ||
src0->type == GGML_TYPE_Q5_0 ||
src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0;
#if QK_K == 256
mul_mat_vec_q_implemented = mul_mat_vec_q_implemented ||
src0->type == GGML_TYPE_Q2_K ||
src0->type == GGML_TYPE_Q3_K ||
src0->type == GGML_TYPE_Q4_K ||
src0->type == GGML_TYPE_Q5_K ||
src0->type == GGML_TYPE_Q6_K;
#endif // QK_K == 256
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented;
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
#endif
if (use_mul_mat_vec_q) {
@@ -2801,21 +2382,6 @@ inline void ggml_cuda_op_mul_mat_vec(
case GGML_TYPE_Q8_0:
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
case GGML_TYPE_Q6_K:
mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
break;
default:
GGML_ASSERT(false);
break;
@@ -2950,21 +2516,13 @@ inline void ggml_cuda_op_rope(
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_ctx = ((int32_t *) src1->data)[3];
GGML_ASSERT(mode == 0);
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
bool is_glm = mode & 4;
// compute
if (is_glm) {
const float id_p = min(p, n_ctx - 2.f);
const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
} else {
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
}
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
(void) dst;
(void) src0_ddq_i;
@@ -3367,21 +2925,11 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
}
void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
}
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
}
void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_norm, true, true);
}
void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true);
@@ -3612,7 +3160,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
}
CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
extra->data_device[id] = buf;
@@ -3652,36 +3200,36 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
}
// recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op;
if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src0->op;
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
ggml_cuda_assign_buffers_impl(tensor->src0, scratch, force_inplace);
}
}
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace);
if (tensor->op == GGML_OP_CPY && tensor->src1->backend == GGML_BACKEND_CPU) {
ggml_cuda_assign_buffers_impl(tensor->src1, scratch, force_inplace);
}
tensor->backend = GGML_BACKEND_GPU;
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
memset(extra, 0, sizeof(*extra));
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
const bool inplace = (tensor->src0 != nullptr && tensor->src0->data == tensor->data) ||
tensor->op == GGML_OP_VIEW ||
force_inplace;
const size_t size = ggml_nbytes(tensor);
CUDA_CHECK(cudaSetDevice(g_main_device));
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
if (inplace && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) {
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0;
if (tensor->op == GGML_OP_VIEW) {
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
memcpy(&offset, tensor->opt[0]->data, sizeof(size_t));
}
extra->data_device[g_main_device] = src0_ddc + offset;
} else if (tensor->op == GGML_OP_CPY) {
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src1->extra;
void * src1_ddv = src1_extra->data_device[g_main_device];
extra->data_device[g_main_device] = src1_ddv;
} else if (scratch) {
@@ -3752,8 +3300,8 @@ void ggml_cuda_free_scratch() {
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|| (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
switch (tensor->op) {
case GGML_OP_ADD:
@@ -3768,24 +3316,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
}
func = ggml_cuda_mul;
break;
case GGML_OP_GELU:
if (!any_on_device) {
return false;
}
func = ggml_cuda_gelu;
break;
case GGML_OP_SILU:
if (!any_on_device) {
return false;
}
func = ggml_cuda_silu;
break;
case GGML_OP_NORM:
if (!any_on_device) {
return false;
}
func = ggml_cuda_norm;
break;
case GGML_OP_RMS_NORM:
if (!any_on_device) {
return false;
@@ -3793,7 +3329,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
func = ggml_cuda_rms_norm;
break;
case GGML_OP_MUL_MAT:
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
return false;
}
func = ggml_cuda_mul_mat;
@@ -3847,6 +3383,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return true;
}
func(tensor->src[0], tensor->src[1], tensor);
func(tensor->src0, tensor->src1, tensor);
return true;
}

View File

@@ -393,8 +393,8 @@ void ggml_metal_graph_compute(
for (int i = node_start; i < node_end; ++i) {
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
struct ggml_tensor * src0 = gf->nodes[i]->src0;
struct ggml_tensor * src1 = gf->nodes[i]->src1;
struct ggml_tensor * dst = gf->nodes[i];
const int64_t ne00 = src0 ? src0->ne[0] : 0;
@@ -450,7 +450,6 @@ void ggml_metal_graph_compute(
//}
switch (dst->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
@@ -740,7 +739,8 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q2_K ||
src0t == GGML_TYPE_Q3_K ||

View File

@@ -365,10 +365,6 @@ kernel void kernel_rms_norm(
}
}
// putting them in the kernel cause a significant performance penalty
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
kernel void kernel_mul_mat_q4_0_f32(
device const void * src0,
device const float * src1,
@@ -376,83 +372,64 @@ kernel void kernel_mul_mat_q4_0_f32(
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
threadgroup float * sum [[threadgroup(0)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
device const float * y = (device const float *) src1 + r1*ne10;
block_q4_0 qb_curr, qb_next;
float4 y_curr[8]; // src1 vector cache
float sumf[N_DST]={0.f}, all_sum;
thread float * yl=(thread float *)y_curr;
// bootstrap
qb_curr = x[tiisg];
// each thread in a SIMD group deals with 1 block.
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
const int ix = tpitg.y/4; // 0 or 1
const int iy = tpitg.y - 4*ix; // 0...3
const int first = 4 * iy;
float sumf = 0;
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
const float d = (float)x[i].d;
device const uint8_t * xl = x[i].qs + first;
device const float * yl = y + i * QK4_0 + first;
float2 acc = {0.0f, 0.0f};
for (int j = 0; j < 4; ++j) {
acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
acc[1] += yl[j] + yl[j+16];
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
sumy *= (-8.f);
for (int row = 0; row < N_DST; row++) {
// prefetch next x block
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
// calculate
float d = qb_curr.d;
float acc = sumy;
for (int i = 0; i < 16; i++) {
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
}
sumf[row] += d * acc;
qb_curr = qb_next;
}
sumf += d * (acc[0] - 8.f*acc[1]);
}
if (nb % N_SIMDWIDTH == 0) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
}
}
} else {
sum[ith] = sumf;
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
sumy *= (-8.f);
for (int row = 0; row < N_DST; row++) {
// prefetch next x block
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
// calculate
float d = qb_curr.d;
float acc = sumy;
for (int i = 0; i < 16; i++) {
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
}
if (tiisg < nb % N_SIMDWIDTH) {
sumf[row] += d * acc;
}
qb_curr = qb_next;
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
}
}
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
}
@@ -463,83 +440,65 @@ kernel void kernel_mul_mat_q4_1_f32(
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
constant int64_t & ne01[[buffer(4)]],
threadgroup float * sum [[threadgroup(0)]],
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK4_0;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
uint2 tpitg[[thread_position_in_threadgroup]],
uint2 tptg[[threads_per_threadgroup]]) {
const int nb = ne00/QK4_1;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
device const float * y = (device const float *) src1 + r1*ne10;
block_q4_1 qb_curr, qb_next;
float4 y_curr[8]; // src1 vector cache
float sumf[N_DST]={0.f}, all_sum;
thread float * yl=(thread float *)y_curr;
// bootstrap
qb_curr = x[tiisg];
// each thread in a SIMD group deals with 1 block.
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
const uint nth = tptg.x*tptg.y;
const uint ith = tptg.y*tpitg.x + tpitg.y;
const int ix = tpitg.y/4; // 0 or 1
const int iy = tpitg.y - 4*ix; // 0...3
const int first = 4 * iy;
float sumf = 0;
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
const float d = (float)x[i].d;
const float m = (float)x[i].m;
device const uint8_t * xl = x[i].qs + first;
device const float * yl = y + i * QK4_1 + first;
float2 acc = {0.0f, 0.0f};
for (int j = 0; j < 4; ++j) {
acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
for (int row = 0; row < N_DST; row++) {
// prefetch next x block
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
// calculate
const float d = qb_curr.d;
const float m = qb_curr.m;
float acc = 0.f;
for (int i = 0; i < 16; i++) {
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
}
sumf[row] += d * acc + m * sumy;
qb_curr = qb_next;
}
sumf += acc[0] + acc[1];
}
if (nb % N_SIMDWIDTH == 0) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
}
}
} else {
sum[ith] = sumf;
float sumy = 0;
for (int i = 0; i < QK4_0 / 4; i++) {
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
}
for (int row = 0; row < N_DST; row++) {
// prefetch next x block
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
// calculate
const float d = qb_curr.d;
const float m = qb_curr.m;
float acc = 0.f;
for (int i = 0; i < 16; i++) {
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
}
if (tiisg < nb % N_SIMDWIDTH) {
sumf[row] += d * acc + m * sumy;
}
qb_curr = qb_next;
all_sum = simd_sum(sumf[row]);
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
}
}
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
}

View File

@@ -56,7 +56,7 @@ void ggml_mpi_eval_init(
MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
if (t == NULL) {
fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
@@ -73,39 +73,13 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
return -1;
}
static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
MPI_Datatype mpi_type;
switch (t->type) {
case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
default: GGML_ASSERT(false && "not implemented");
}
const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
MPI_Datatype mpi_type;
switch (t->type) {
case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
default: GGML_ASSERT(false && "not implemented");
}
MPI_Status status; UNUSED(status);
const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
// TODO: there are many improvements that can be done to this implementation
void ggml_mpi_graph_compute_pre(
void ggml_mpi_graph_compute(
struct ggml_mpi_context * ctx_mpi,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
int n_layers) {
int n_layers,
int n_threads) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
@@ -135,19 +109,41 @@ void ggml_mpi_graph_compute_pre(
// node 0: [(n-1) * n_per_node, n_nodes)
//
if (mpi_rank > 0) {
if (mpi_rank == 1) {
// the first node (1) receives the input tokens from the main node (0)
ggml_mpi_tensor_recv(inp_tokens, 0);
} else {
// recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
if (mpi_rank == 1) { // the first node receives the input tokens from the main node
MPI_Status status; UNUSED(status);
const int mpi_rank_src = mpi_rank - 1;
const int retval = MPI_Recv(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
} else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
MPI_Status status; UNUSED(status);
const int mpi_rank_src = mpi_rank - 1;
//printf("%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
} else if (mpi_size > 1) {
// node 0 sends the input tokens to node 1
ggml_mpi_tensor_send(inp_tokens, 1);
{
const int mpi_rank_dst = mpi_rank + 1;
const int retval = MPI_Send(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
// recv the output data from the last node
ggml_mpi_tensor_recv(inp0, mpi_size - 1);
{
MPI_Status status; UNUSED(status);
const int mpi_rank_src = mpi_size - 1;
//fprintf(stderr, "%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
}
{
@@ -175,11 +171,11 @@ void ggml_mpi_graph_compute_pre(
// attach the input data to all nodes that need it
// TODO: not great - should be able to do this without modifying the compute graph (see next TODO below)
for (int i = idx_l0; i < idx_l1; i++) {
if (gf->nodes[i]->src[0] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[0] = inp0;
if (gf->nodes[i]->src0 == gf->nodes[idx_l0]) {
gf->nodes[i]->src0 = inp0;
}
if (gf->nodes[i]->src[1] == gf->nodes[idx_l0]) {
gf->nodes[i]->src[1] = inp0;
if (gf->nodes[i]->src1 == gf->nodes[idx_l0]) {
gf->nodes[i]->src1 = inp0;
}
}
@@ -198,19 +194,20 @@ void ggml_mpi_graph_compute_pre(
//fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
}
}
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
UNUSED(n_layers);
ggml_graph_compute_with_ctx(ctx, gf, n_threads);
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
//fprintf(stderr, "%s: node %d: done\n", __func__, mpi_rank);
// send the output data to the next node
if (mpi_rank > 0) {
ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
const int mpi_rank_dst = (mpi_rank + 1) % mpi_size;
//fprintf(stderr, "%s: node %d: sending %d elements to node %d\n", __func__, mpi_rank, ggml_nelements(output), mpi_rank_dst);
const int retval = MPI_Send(output->data, ggml_nelements(output), MPI_FLOAT, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
}

View File

@@ -24,15 +24,12 @@ void ggml_mpi_eval_init(
int * n_past,
int * n_threads);
void ggml_mpi_graph_compute_pre(
void ggml_mpi_graph_compute(
struct ggml_mpi_context * ctx_mpi,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
int n_layers);
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
int n_layers,
int n_threads);
#ifdef __cplusplus
}

1305
ggml.c

File diff suppressed because it is too large Load Diff

48
ggml.h
View File

@@ -132,10 +132,10 @@
// {
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
//
// // a[2, 1] = 1.0f;
// // a[1, 2] = 1.0f;
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
//
// // a[0, 2] = 2.0f;
// // a[2, 0] = 2.0f;
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
//
// ...
@@ -197,17 +197,12 @@
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 48
#define GGML_DEFAULT_N_THREADS 4
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGML_UNUSED(x) (void)(x)
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
@@ -368,8 +363,6 @@ extern "C" {
GGML_OP_CLAMP,
GGML_OP_CONV_1D,
GGML_OP_CONV_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
@@ -421,7 +414,9 @@ extern "C" {
bool is_param;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
struct ggml_tensor * src0;
struct ggml_tensor * src1;
struct ggml_tensor * opt[GGML_MAX_OPT];
// performance
int perf_runs;
@@ -449,10 +444,6 @@ extern "C" {
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
int n_tasks[GGML_MAX_NODES];
// abort ggml_graph_compute when true
bool (*abort_callback)(void * data);
void * abort_callback_data;
};
// computation graph
@@ -1175,31 +1166,6 @@ extern "C" {
int s,
int d);
enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,
GGML_OP_POOL_COUNT,
};
GGML_API struct ggml_tensor* ggml_pool_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0, // kernel size
int s0, // stride
int p0); // padding
GGML_API struct ggml_tensor* ggml_pool_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
int p0,
int p1);
GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx,
struct ggml_tensor * q,
@@ -1339,7 +1305,7 @@ extern "C" {
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
// same as ggml_graph_compute() but the work data is allocated as a part of the context

View File

@@ -15,14 +15,6 @@
#define K_SCALE_SIZE 12
#endif
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
//
// Super-block quantization structures
//

197
llama.cpp
View File

@@ -303,7 +303,7 @@ struct llama_model {
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
#ifdef GGML_USE_METAL
~llama_context() {
if (ctx_metal) {
@@ -324,6 +324,7 @@ struct llama_context {
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
const llama_model & model;
const llama_vocab & vocab;
bool model_owner = false;
@@ -1631,10 +1632,6 @@ static bool llama_eval_internal(
// run the computation
ggml_build_forward_expand(&gf, cur);
#if GGML_USE_MPI
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
@@ -1659,19 +1656,14 @@ static bool llama_eval_internal(
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
}
#elif GGML_USE_MPI
ggml_mpi_graph_compute(lctx.ctx_mpi, ctx0, &gf, n_layer, n_threads);
cur = gf.nodes[gf.n_nodes - 1];
#else
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
#endif
#if GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
#endif
// update kv token count
lctx.kv_self.n = n_past + N;
struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
if (cgraph_fname) {
ggml_graph_export(&gf, cgraph_fname);
}
@@ -1687,26 +1679,38 @@ static bool llama_eval_internal(
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
//}
// extract logits
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
// update kv token count
lctx.kv_self.n = n_past + N;
#ifdef GGML_USE_MPI
if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
#else
{
auto & logits_out = lctx.logits;
#endif
// extract logits
{
auto & logits_out = lctx.logits;
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
}
}
}
// extract embeddings
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
// extract embeddings
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
}
}
if (mem_per_token == 0) {
@@ -2166,62 +2170,6 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
}
}
static void llama_log_softmax(float * array, size_t size) {
float max_l = *std::max_element(array, array + size);
float sum = 0.f;
for (size_t i = 0; i < size; ++i) {
float p = expf(array[i] - max_l);
sum += p;
array[i] = p;
}
for (size_t i = 0; i < size; ++i) {
array[i] = logf(array[i] / sum);
}
}
void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale,
float smooth_factor) {
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
assert(ctx);
auto n_vocab = llama_n_vocab(ctx);
assert(n_vocab == (int)candidates->size);
assert(!candidates->sorted);
std::vector<float> logits_base;
logits_base.reserve(candidates->size);
for (size_t i = 0; i < candidates->size; ++i) {
logits_base.push_back(candidates->data[i].logit);
}
llama_log_softmax(logits_base.data(), candidates->size);
float* logits_guidance = llama_get_logits(guidance_ctx);
llama_log_softmax(logits_guidance, n_vocab);
for (int i = 0; i < n_vocab; ++i) {
float logit_guidance = logits_guidance[i];
float logit_base = logits_base[i];
logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
}
llama_log_softmax(logits_guidance, n_vocab);
for (int i = 0; i < n_vocab; ++i) {
float logit_base = logits_base[i];
float logit_guidance = logits_guidance[i];
candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
}
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
assert(ctx);
@@ -2509,14 +2457,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
bool convert_incompatible_tensor = false;
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
int nx = tensor.ne.at(0);
int ny = tensor.ne.at(1);
if (nx % QK_K != 0 || ny % QK_K != 0) {
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
convert_incompatible_tensor = true;
fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
fprintf(stderr, "This is required to be able to use k-quants for now!\n");
fprintf(stderr, "========================================================================================\n\n");
throw std::runtime_error("Unsupported tensor size encountered\n");
}
}
if (tensor.name == "output.weight") {
@@ -2544,17 +2493,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
if (convert_incompatible_tensor) {
if (tensor.name == "output.weight") {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
fprintf(stderr, "F16 will be used for this tensor instead.\n");
} else if (tensor.name == "tok_embeddings.weight") {
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
} else {
throw std::runtime_error("Unsupported tensor size encountered\n");
}
}
#endif
float * f32_data;
@@ -2696,7 +2634,7 @@ struct llama_context * llama_new_context_with_model(
return nullptr;
}
llama_context * ctx = new llama_context(*model);
llama_context * ctx = new llama_context(*model, model->vocab);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
@@ -3534,13 +3472,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
return 0;
}
int llama_tokenize_with_model(
const struct llama_model * model,
int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto res = llama_tokenize(model->vocab, text, add_bos);
auto res = llama_tokenize(ctx->vocab, text, add_bos);
if (n_max_tokens < (int) res.size()) {
fprintf(stderr, "%s: too many tokens\n", __func__);
@@ -3554,29 +3492,8 @@ int llama_tokenize_with_model(
return res.size();
}
int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
}
int llama_n_vocab_from_model(const struct llama_model * model) {
return model->vocab.id_to_token.size();
}
int llama_n_ctx_from_model(const struct llama_model * model) {
return model->hparams.n_ctx;
}
int llama_n_embd_from_model(const struct llama_model * model) {
return model->hparams.n_embd;
}
int llama_n_vocab(const struct llama_context * ctx) {
return ctx->model.vocab.id_to_token.size();
return ctx->vocab.id_to_token.size();
}
int llama_n_ctx(const struct llama_context * ctx) {
@@ -3587,25 +3504,17 @@ int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) model->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = model->vocab.id_to_token[i].tok.c_str();
scores[i] = model->vocab.id_to_token[i].score;
}
return n;
}
int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity) {
return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
scores[i] = ctx->vocab.id_to_token[i].score;
}
return n;
}
float * llama_get_logits(struct llama_context * ctx) {
@@ -3616,16 +3525,12 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}
const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
if (token >= llama_n_vocab_from_model(model)) {
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
if (token >= llama_n_vocab(ctx)) {
return nullptr;
}
return model->vocab.id_to_token[token].tok.c_str();
}
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
return llama_token_to_str_with_model(&ctx->model, token);
return ctx->vocab.id_to_token[token].tok.c_str();
}
llama_token llama_token_bos() {

37
llama.h
View File

@@ -270,21 +270,10 @@ extern "C" {
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
@@ -293,12 +282,6 @@ extern "C" {
float * scores,
int capacity);
LLAMA_API int llama_get_vocab_from_model(
const struct llama_model * model,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
@@ -311,13 +294,7 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(
const struct llama_context * ctx,
llama_token token);
LLAMA_API const char * llama_token_to_str_with_model(
const struct llama_model * model,
llama_token token);
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
@@ -332,18 +309,6 @@ extern "C" {
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
/// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
LLAMA_API void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale,
float smooth_factor);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

View File

@@ -10,9 +10,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
#define MAX_NARGS 3

View File

@@ -7,9 +7,7 @@
#define MAX_NARGS 2
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
//
// logging

View File

@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
llama_init_backend(false);
// load the vocab
{
@@ -99,7 +99,7 @@ int main(int argc, char **argv) {
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
llama_finalize_backend();
return 0;
}