Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-30 16:47:31 +03:00)

Compare commits: b1428 ... llama-refa (50 commits)
| SHA1 |
| --- |
| 7923b70cb8 |
| 2073347e3b |
| fc5a26aade |
| 0bfdcdd0f8 |
| 6669cd8329 |
| 2926ef63b1 |
| a3f80013ad |
| 792d1a1b16 |
| f39e6075cf |
| c9121fdd0f |
| a104abea48 |
| 31a12f3d03 |
| 5990861938 |
| 3e0462594b |
| 909d64471b |
| 38728a0be0 |
| dbf836bb64 |
| 7db9c96d8a |
| 210e6e5d02 |
| 79ad734417 |
| 761087932b |
| 8925cf9ef8 |
| 1e9c5443c2 |
| da936188d8 |
| 739b85c985 |
| 25cfbf6776 |
| b4ad03b3a7 |
| 79617902ea |
| e14aa46151 |
| 0dc05b8433 |
| 4e98897ede |
| 51c4f9ee9f |
| 3af8771389 |
| 83d2c43791 |
| 38aca9e1ab |
| 5946d98fc8 |
| 8b2420d249 |
| ff3bad83e2 |
| 82a6646e02 |
| ba231e8a6d |
| 8a2f2fea29 |
| bd6d9e2059 |
| ee1a0ec9cb |
| 177461104b |
| fdee152e4e |
| 41aee4df82 |
| 6d459cbfbe |
| c8d6a1f34a |
| 2f9ec7e271 |
| 34b2a5e1ee |
.github/ISSUE_TEMPLATE/bug.md (vendored, 2 changed lines)

@@ -1,7 +1,7 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug"]
labels: ["bug-unconfirmed"]
assignees: ''

---
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)

@@ -305,6 +306,9 @@ if (LLAMA_CUBLAS)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y)

@@ -405,6 +409,9 @@ if (LLAMA_HIPBLAS)
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
Makefile (3 changed lines)

@@ -397,6 +397,9 @@ endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_FORCE_MMQ
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # LLAMA_CUDA_FORCE_MMQ
ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else
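Both build systems now expose LLAMA_CUDA_FORCE_MMQ alongside the existing LLAMA_CUDA_FORCE_DMMV, and each flag simply defines the matching GGML_CUDA_* macro for the CUDA/HIP sources. A condensed sketch of what those macros control in ggml-cuda.cu (see the CUDA hunks later in this diff; this is an illustration, not a verbatim copy of that file):

```cpp
// Effect of the build flags on the CUDA backend, condensed for illustration.

#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES   // without the override, cuBLAS F16 tensor cores are preferred
#endif

#ifdef GGML_CUDA_FORCE_DMMV
constexpr bool use_mul_mat_vec_q = false;  // always use the dequantize_mul_mat_vec kernels
#else
constexpr bool use_mul_mat_vec_q = true;   // assumption: the real code also gates this on DP4A support
#endif
```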
@@ -224,6 +224,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
sparams.temp = std::stof(argv[i]);
sparams.temp = std::max(sparams.temp, 0.0f);
} else if (arg == "--tfs") {
if (++i >= argc) {
invalid_param = true;

@@ -743,7 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif // GGML_USE_CUBLAS
#endif
printf(" --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -167,8 +167,12 @@ llama_token llama_sampling_sample(
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}

if (temp <= 0) {
// greedy sampling
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
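With this hunk, temperature selects between three behaviors: temp > 0 keeps stochastic sampling, temp == 0 is plain greedy, and temp < 0 is greedy but still computes the candidate probabilities (the speculative example below relies on that last mode). A minimal sketch of the caller-side convention, assuming the llama_sampling_* helpers from common/sampling.h that this diff touches (include path and exact signatures assumed):

```cpp
#include "llama.h"
#include "sampling.h"   // common/sampling.h (include path assumed)

// Sketch: pick a token greedily while keeping candidate probabilities available,
// using the new "temp < 0" convention.
static llama_token greedy_with_probs(llama_context * ctx, llama_sampling_params sparams) {
    sparams.temp = -1.0f;  // < 0 -> greedy sampling, with probs

    llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx, /*ctx_cfg=*/nullptr);
    llama_sampling_free(ctx_sampling);

    return id;
}
```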
convert.py (21 changed lines)

@@ -366,16 +366,19 @@ class SentencePieceVocab:
added_tokens = {}

vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
    raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")

items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())

if expected_new_ids != actual_new_ids:
    raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

# Token pieces that were added to the base vocabulary.
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens

def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -1502,7 +1502,7 @@ struct llama_server_context
{
for (auto & slot : slots)
{
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();

// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(512, 0, 1);

// evaluate the initial prompt
batch.n_tokens = tokens_list.size();

for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
}

// llama_decode will output logits only for the last token of the prompt

@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
fflush(stdout);

// prepare the next batch
batch.n_tokens = 0;
llama_batch_clear(batch);

// push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id;
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = 0;
batch.logits[batch.n_tokens] = true;

batch.n_tokens += 1;
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

n_decode += 1;
}
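The example now builds batches through the llama_batch_add / llama_batch_clear helpers instead of writing the llama_batch fields by hand. A compressed sketch of that pattern (helper names as used in the diff; their header location is assumed, and the logits handling here is simplified relative to the example):

```cpp
#include <vector>
#include "llama.h"
#include "common.h"   // llama_batch_add / llama_batch_clear helpers (header name assumed)

// Sketch: evaluate a prompt, then feed back one newly sampled token.
static void eval_prompt_then_one_token(llama_context * ctx,
                                       const std::vector<llama_token> & prompt,
                                       llama_token new_token_id) {
    llama_batch batch = llama_batch_init(512, 0, 1);

    // prompt tokens at positions 0..n-1; request logits only for the last one
    for (size_t i = 0; i < prompt.size(); ++i) {
        llama_batch_add(batch, prompt[i], (llama_pos) i, { 0 }, i + 1 == prompt.size());
    }
    llama_decode(ctx, batch);

    // next step: clear the batch and push the new token at the next position
    llama_batch_clear(batch);
    llama_batch_add(batch, new_token_id, (llama_pos) prompt.size(), { 0 }, true);
    llama_decode(ctx, batch);

    llama_batch_free(batch);
}
```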
@@ -8,6 +8,9 @@
#include <string>
#include <vector>

#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

struct seq_draft {
bool active = false;
bool drafting = false;

@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

{
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;

if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}

for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
}

// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);

@@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft);

params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = std::max(0.01f, params.sparams.temp);
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model

for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);

@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);

llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft);

++n_past_dft;

@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}

//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
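The new guard refuses to speculate when the draft and target vocabularies differ in size by more than SPEC_VOCAB_MAX_SIZE_DIFFERENCE entries, or disagree on any token text past the first few special IDs. The same check factored into a free-standing helper, purely for illustration (this helper is not part of llama.cpp):

```cpp
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include "llama.h"

// Sketch: decide whether a draft model's vocabulary is close enough to the
// target model's to be used for speculative decoding.
static bool draft_vocab_compatible(const llama_model * model_tgt, const llama_model * model_dft) {
    const int n_vocab_tgt = llama_n_vocab(model_tgt);
    const int n_vocab_dft = llama_n_vocab(model_dft);

    // sizes may differ slightly (SPEC_VOCAB_MAX_SIZE_DIFFERENCE in the diff above) ...
    if (std::abs(n_vocab_tgt - n_vocab_dft) > 100) {
        return false;
    }

    // ... but shared IDs must map to identical text, skipping the first few
    // special tokens (SPEC_VOCAB_CHECK_START_TOKEN_ID)
    for (int i = 5; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
        if (std::strcmp(llama_token_get_text(model_tgt, i), llama_token_get_text(model_dft, i)) != 0) {
            return false;
        }
    }

    return true;
}
```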
flake.lock (generated, 6 changed lines)

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1692913444,
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
"lastModified": 1698134075,
"narHash": "sha256-foCD+nuKzfh49bIoiCBur4+Fx1nozo+4C/6k8BYk4sg=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
"rev": "8efd5d1e283604f75a808a20e6cde0ef313d07d4",
"type": "github"
},
"original": {
@@ -51,6 +51,9 @@
};
llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"

@@ -126,5 +129,9 @@
buildInputs = [ llama-python ];
packages = nativeBuildInputs ++ osSpecific;
};
devShells.extra = pkgs.mkShell {
buildInputs = [ llama-python-extra ];
packages = nativeBuildInputs ++ osSpecific;
};
});
}
ggml-cuda.cu (130 changed lines)

@@ -87,6 +87,24 @@
#define CC_OFFSET_AMD 1000000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)

// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ

// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
// probably other such cases, and not sure what happens on AMD hardware
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

@@ -470,7 +488,6 @@ static int g_device_count = -1;
static int g_main_device = 0;
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
static bool g_mul_mat_q = true;

static void * g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
@@ -3554,9 +3571,15 @@ static __device__ __forceinline__ void mul_mat_q(
#define MMQ_X_Q4_0_RDNA1 64
#define MMQ_Y_Q4_0_RDNA1 64
#define NWARPS_Q4_0_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q4_0_AMPERE 4
#define MMQ_Y_Q4_0_AMPERE 32
#define NWARPS_Q4_0_AMPERE 4
#else
#define MMQ_X_Q4_0_AMPERE 64
#define MMQ_Y_Q4_0_AMPERE 128
#define NWARPS_Q4_0_AMPERE 4
#endif
#define MMQ_X_Q4_0_PASCAL 64
#define MMQ_Y_Q4_0_PASCAL 64
#define NWARPS_Q4_0_PASCAL 8

@@ -3615,9 +3638,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q4_1_RDNA1 64
#define MMQ_Y_Q4_1_RDNA1 64
#define NWARPS_Q4_1_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q4_1_AMPERE 4
#define MMQ_Y_Q4_1_AMPERE 32
#define NWARPS_Q4_1_AMPERE 4
#else
#define MMQ_X_Q4_1_AMPERE 64
#define MMQ_Y_Q4_1_AMPERE 128
#define NWARPS_Q4_1_AMPERE 4
#endif
#define MMQ_X_Q4_1_PASCAL 64
#define MMQ_Y_Q4_1_PASCAL 64
#define NWARPS_Q4_1_PASCAL 8

@@ -3678,9 +3707,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q5_0_RDNA1 64
#define MMQ_Y_Q5_0_RDNA1 64
#define NWARPS_Q5_0_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q5_0_AMPERE 4
#define MMQ_Y_Q5_0_AMPERE 32
#define NWARPS_Q5_0_AMPERE 4
#else
#define MMQ_X_Q5_0_AMPERE 128
#define MMQ_Y_Q5_0_AMPERE 64
#define NWARPS_Q5_0_AMPERE 4
#endif
#define MMQ_X_Q5_0_PASCAL 64
#define MMQ_Y_Q5_0_PASCAL 64
#define NWARPS_Q5_0_PASCAL 8

@@ -3739,9 +3774,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q5_1_RDNA1 64
#define MMQ_Y_Q5_1_RDNA1 64
#define NWARPS_Q5_1_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q5_1_AMPERE 4
#define MMQ_Y_Q5_1_AMPERE 32
#define NWARPS_Q5_1_AMPERE 4
#else
#define MMQ_X_Q5_1_AMPERE 128
#define MMQ_Y_Q5_1_AMPERE 64
#define NWARPS_Q5_1_AMPERE 4
#endif
#define MMQ_X_Q5_1_PASCAL 64
#define MMQ_Y_Q5_1_PASCAL 64
#define NWARPS_Q5_1_PASCAL 8

@@ -3800,9 +3841,15 @@ mul_mat_q5_1(
#define MMQ_X_Q8_0_RDNA1 64
#define MMQ_Y_Q8_0_RDNA1 64
#define NWARPS_Q8_0_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q8_0_AMPERE 4
#define MMQ_Y_Q8_0_AMPERE 32
#define NWARPS_Q8_0_AMPERE 4
#else
#define MMQ_X_Q8_0_AMPERE 128
#define MMQ_Y_Q8_0_AMPERE 64
#define NWARPS_Q8_0_AMPERE 4
#endif
#define MMQ_X_Q8_0_PASCAL 64
#define MMQ_Y_Q8_0_PASCAL 64
#define NWARPS_Q8_0_PASCAL 8

@@ -3861,9 +3908,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q2_K_RDNA1 128
#define MMQ_Y_Q2_K_RDNA1 32
#define NWARPS_Q2_K_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q2_K_AMPERE 4
#define MMQ_Y_Q2_K_AMPERE 32
#define NWARPS_Q2_K_AMPERE 4
#else
#define MMQ_X_Q2_K_AMPERE 64
#define MMQ_Y_Q2_K_AMPERE 128
#define NWARPS_Q2_K_AMPERE 4
#endif
#define MMQ_X_Q2_K_PASCAL 64
#define MMQ_Y_Q2_K_PASCAL 64
#define NWARPS_Q2_K_PASCAL 8

@@ -3922,9 +3975,15 @@ mul_mat_q2_K(
#define MMQ_X_Q3_K_RDNA1 32
#define MMQ_Y_Q3_K_RDNA1 128
#define NWARPS_Q3_K_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q3_K_AMPERE 4
#define MMQ_Y_Q3_K_AMPERE 32
#define NWARPS_Q3_K_AMPERE 4
#else
#define MMQ_X_Q3_K_AMPERE 128
#define MMQ_Y_Q3_K_AMPERE 128
#define NWARPS_Q3_K_AMPERE 4
#endif
#define MMQ_X_Q3_K_PASCAL 64
#define MMQ_Y_Q3_K_PASCAL 64
#define NWARPS_Q3_K_PASCAL 8

@@ -3985,9 +4044,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q4_K_RDNA1 32
#define MMQ_Y_Q4_K_RDNA1 64
#define NWARPS_Q4_K_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q4_K_AMPERE 4
#define MMQ_Y_Q4_K_AMPERE 32
#define NWARPS_Q4_K_AMPERE 4
#else
#define MMQ_X_Q4_K_AMPERE 64
#define MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
#endif
#define MMQ_X_Q4_K_PASCAL 64
#define MMQ_Y_Q4_K_PASCAL 64
#define NWARPS_Q4_K_PASCAL 8

@@ -4048,9 +4113,15 @@ template <bool need_check> static __global__ void
#define MMQ_X_Q5_K_RDNA1 32
#define MMQ_Y_Q5_K_RDNA1 64
#define NWARPS_Q5_K_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q5_K_AMPERE 4
#define MMQ_Y_Q5_K_AMPERE 32
#define NWARPS_Q5_K_AMPERE 4
#else
#define MMQ_X_Q5_K_AMPERE 64
#define MMQ_Y_Q5_K_AMPERE 128
#define NWARPS_Q5_K_AMPERE 4
#endif
#define MMQ_X_Q5_K_PASCAL 64
#define MMQ_Y_Q5_K_PASCAL 64
#define NWARPS_Q5_K_PASCAL 8

@@ -4109,9 +4180,15 @@ mul_mat_q5_K(
#define MMQ_X_Q6_K_RDNA1 32
#define MMQ_Y_Q6_K_RDNA1 64
#define NWARPS_Q6_K_RDNA1 8
#if defined(CUDA_USE_TENSOR_CORES)
#define MMQ_X_Q6_K_AMPERE 4
#define MMQ_Y_Q6_K_AMPERE 32
#define NWARPS_Q6_K_AMPERE 4
#else
#define MMQ_X_Q6_K_AMPERE 64
#define MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
#endif
#define MMQ_X_Q6_K_PASCAL 64
#define MMQ_Y_Q6_K_PASCAL 64
#define NWARPS_Q6_K_PASCAL 8
@@ -5663,6 +5740,16 @@ void ggml_init_cublas() {
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif
#if defined(CUDA_USE_TENSOR_CORES)
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
#else
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;

@@ -6347,7 +6434,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha, src0_ddf_i, ne00,
src1_ddf_i, ne10,
src1_ddf_i, ne10,
&beta, dst_dd_i, ldc));

if (src0_as != 0) {

@@ -7048,9 +7135,10 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}

static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));

GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -7202,17 +7290,24 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
}

static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
const bool all_on_device =
(src0->backend == GGML_BACKEND_GPU) &&
(src1->backend == GGML_BACKEND_GPU) &&
( dst->backend == GGML_BACKEND_GPU);

int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_compute_capabilities[id]
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
min_compute_capability = g_compute_capabilities[id];
}
}

#ifdef CUDA_USE_TENSOR_CORES
const bool use_tensor_cores = true;
#else
const bool use_tensor_cores = false;
#endif

// debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);

@@ -7221,20 +7316,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {

#ifdef GGML_CUDA_FORCE_DMMV
const bool use_mul_mat_vec_q = false;
#else

@@ -7247,7 +7341,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
}
} else {
if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);

// when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
use_mul_mat_q = false;
}

if (use_mul_mat_q) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
} else {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);

@@ -7601,10 +7703,6 @@ void ggml_cuda_set_main_device(const int main_device) {
}
}

void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
g_mul_mat_q = mul_mat_q;
}

void ggml_cuda_set_scratch_size(const size_t scratch_size) {
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
// it still won't always work as expected, but it's better than nothing
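The net effect of the ggml-cuda.cu changes is a batch-size-dependent choice between the quantized MMQ kernels and the cuBLAS tensor-core path. A condensed sketch of that policy for a quantized src0 (the MIN_CC_DP4A and CC_VOLTA values are assumptions, they are not shown in this diff):

```cpp
#include <cstdint>

// Sketch of the kernel-selection policy implemented above; not the verbatim ggml-cuda.cu code.
static bool should_use_mul_mat_q(bool src0_quantized, int min_compute_capability, int64_t src1_ncols) {
    constexpr int MIN_CC_DP4A        = 610;  // assumption: value not shown in this diff
    constexpr int CC_VOLTA           = 700;  // assumption: value not shown in this diff
    constexpr int MMQ_MAX_BATCH_SIZE = 32;   // from the diff above

#ifdef CUDA_USE_TENSOR_CORES
    const bool use_tensor_cores = true;
#else
    const bool use_tensor_cores = false;
#endif

    bool use_mul_mat_q = src0_quantized && min_compute_capability >= MIN_CC_DP4A;

    // with tensor cores available, large batches go to cuBLAS instead of MMQ (ref: PR #3776)
    if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1_ncols > MMQ_MAX_BATCH_SIZE) {
        use_mul_mat_q = false;
    }

    return use_mul_mat_q;
}
```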
ggml-metal.m (15 changed lines)

@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
if (sourcePath == nil) {
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
sourcePath = @"ggml-metal.metal";
}
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
if (error) {

@@ -234,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels
{
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \

/*
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
*/
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}
ggml.h (2 changed lines)

@@ -709,7 +709,7 @@ extern "C" {
// Context tensor enumeration and lookup
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name);

GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
llama.h (3 changed lines)

@@ -178,7 +178,7 @@ extern "C" {
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model

// Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool embedding; // embedding mode only

@@ -658,6 +658,7 @@ extern "C" {
float * mu);

/// @details Selects the token with the highest probability.
/// Does not compute the token probabilities. Use llama_sample_softmax() instead.
LLAMA_API llama_token llama_sample_token_greedy(
struct llama_context * ctx,
llama_token_data_array * candidates);