mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
11 Commits
b6150
...
gg/graph-p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
220860aa0c | ||
|
|
d32e03f449 | ||
|
|
3973163bff | ||
|
|
5ade3000bd | ||
|
|
8b2483730f | ||
|
|
810b9fc8b9 | ||
|
|
4ebd0c125b | ||
|
|
5cdb27e091 | ||
|
|
3ea913f1ce | ||
|
|
29c8fbe4e0 | ||
|
|
1adc9812bd |
12
.github/workflows/build.yml
vendored
12
.github/workflows/build.yml
vendored
@@ -443,7 +443,7 @@ jobs:
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
container: rocm/dev-ubuntu-22.04:6.0.2
|
||||
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -471,16 +471,6 @@ jobs:
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
- name: Build with legacy HIP support
|
||||
id: cmake_build_legacy_hip
|
||||
run: |
|
||||
cmake -B build2 -S . \
|
||||
-DCMAKE_C_COMPILER=hipcc \
|
||||
-DCMAKE_CXX_COMPILER=hipcc \
|
||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
||||
-DGGML_HIP=ON
|
||||
cmake --build build2 --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||
|
||||
@@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
|
||||
|
||||
# Add path to modules
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
||||
|
||||
@@ -1238,6 +1238,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||
common_params_print_completion(ctx_arg);
|
||||
exit(0);
|
||||
}
|
||||
params.lr.init();
|
||||
} catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "%s\n", ex.what());
|
||||
ctx_arg.params = params_org;
|
||||
@@ -1506,6 +1507,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.swa_full = true;
|
||||
}
|
||||
).set_env("LLAMA_ARG_SWA_FULL"));
|
||||
add_opt(common_arg(
|
||||
{"--swa-checkpoints"}, "N",
|
||||
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
|
||||
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
|
||||
[](common_params & params, int value) {
|
||||
params.n_swa_checkpoints = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
{"--kv-unified", "-kvu"},
|
||||
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
||||
@@ -2688,7 +2697,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.out_file = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
|
||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
|
||||
add_opt(common_arg(
|
||||
{"-ofreq", "--output-frequency"}, "N",
|
||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||
@@ -3566,5 +3575,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||
|
||||
|
||||
add_opt(
|
||||
common_arg({ "-lr", "--learning-rate" }, "ALPHA",
|
||||
string_format(
|
||||
"adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
|
||||
(double) params.lr.lr0),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(
|
||||
common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
|
||||
string_format(
|
||||
"(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
||||
(double) params.lr.lr_min),
|
||||
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(
|
||||
common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
|
||||
string_format(
|
||||
"(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
|
||||
(double) params.lr.decay_epochs),
|
||||
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg(
|
||||
{ "-wd", "--weight-decay" }, "WD",
|
||||
string_format(
|
||||
"adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
|
||||
(double) params.lr.wd),
|
||||
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
|
||||
string_format("fraction of data to use as validation set for training (default: %.2g).",
|
||||
(double) params.val_split),
|
||||
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg({ "-epochs", "--epochs" }, "N",
|
||||
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
||||
[](common_params & params, int epochs) { params.lr.epochs = epochs; })
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
|
||||
[](common_params & params, const std::string & name) {
|
||||
params.optimizer = common_opt_get_optimizer(name.c_str());
|
||||
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
|
||||
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
||||
}
|
||||
})
|
||||
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||
|
||||
return ctx_arg;
|
||||
}
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
#endif
|
||||
#include <locale>
|
||||
#include <windows.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#else
|
||||
@@ -1565,3 +1566,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
|
||||
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
const lr_opt & d = *(lr_opt *) userdata;
|
||||
result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
|
||||
result.sgd.wd = result.adamw.wd = d.wd;
|
||||
return result;
|
||||
}
|
||||
|
||||
// TODO make all command line args case-insensitive
|
||||
static inline bool eq_case_insensitive(char const* a, char const* b) {
|
||||
return !
|
||||
#if defined(_MSC_VER)
|
||||
_stricmp
|
||||
#else
|
||||
strcasecmp
|
||||
#endif // defined(_MSC_VER)
|
||||
(a, b);
|
||||
}
|
||||
|
||||
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
|
||||
if (eq_case_insensitive("adamw", n)) {
|
||||
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
}
|
||||
if (eq_case_insensitive("sgd", n)) {
|
||||
return GGML_OPT_OPTIMIZER_TYPE_SGD;
|
||||
}
|
||||
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
|
||||
}
|
||||
|
||||
// TODO simplify to use just log and exp
|
||||
static float const k_log_2 = std::log(2.f);
|
||||
|
||||
void lr_opt::init() {
|
||||
if (lr_min > 0 && lr_min < lr0) {
|
||||
float nhalf = std::log(lr0 / lr_min) / k_log_2;
|
||||
float e = epochs;
|
||||
if (decay_epochs > 0 && decay_epochs < e) {
|
||||
e = decay_epochs;
|
||||
} else {
|
||||
decay_epochs = e;
|
||||
}
|
||||
scale_epoch = nhalf / e;
|
||||
}
|
||||
}
|
||||
|
||||
float lr_opt::get_lr(float epoch) const {
|
||||
float r = lr_min <= 0 ? lr0 :
|
||||
epoch >= decay_epochs ? lr_min :
|
||||
lr0 * std::pow(0.5f, epoch * scale_epoch);
|
||||
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
|
||||
return r;
|
||||
}
|
||||
|
||||
@@ -2,14 +2,17 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <cmath>
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@@ -82,6 +85,7 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
LLAMA_EXAMPLE_TTS,
|
||||
LLAMA_EXAMPLE_DIFFUSION,
|
||||
LLAMA_EXAMPLE_FINETUNE,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
@@ -243,6 +247,25 @@ enum common_reasoning_format {
|
||||
COMMON_REASONING_FORMAT_GRANITE, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
||||
};
|
||||
|
||||
|
||||
struct lr_opt {
|
||||
float lr0 = 1e-5; // learning rate at first epoch
|
||||
float lr_min = -1;
|
||||
float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
|
||||
float scale_epoch = 0;
|
||||
float wd = 0;
|
||||
unsigned epochs = 2;
|
||||
|
||||
unsigned epoch; // set by optimizer outer (epochs) loop
|
||||
// learning rate decay - constant LR per epoch only for now
|
||||
float get_lr(float e) const;
|
||||
float get_lr() const { return get_lr(epoch); }
|
||||
// must call after arg parse, before get_lr
|
||||
void init();
|
||||
};
|
||||
|
||||
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
||||
|
||||
struct common_params {
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 4096; // context size
|
||||
@@ -377,6 +400,11 @@ struct common_params {
|
||||
bool no_mmproj = false; // explicitly disable multimodal model
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// finetune
|
||||
struct lr_opt lr;
|
||||
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||
|
||||
// embedding
|
||||
bool embedding = false; // get only sentence embedding
|
||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||
@@ -385,11 +413,12 @@ struct common_params {
|
||||
std::string cls_sep = "\t"; // separator of classification sequences
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = ""; // NOLINT
|
||||
@@ -704,3 +733,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
//
|
||||
|
||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||
|
||||
// "adamw" or "sgd" (case insensitive)
|
||||
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
|
||||
|
||||
@@ -10,20 +10,20 @@
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
params.escape = false;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.use_mmap) {
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
|
||||
__func__);
|
||||
params.use_mmap = false;
|
||||
}
|
||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||
@@ -38,11 +38,10 @@ int main(int argc, char ** argv) {
|
||||
common_init();
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
llama_model_ptr & model = llama_init.model;
|
||||
llama_context_ptr & ctx = llama_init.context;
|
||||
common_init_result llama_init = common_init_from_params(params);
|
||||
llama_model_ptr & model = llama_init.model;
|
||||
llama_context_ptr & ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
@@ -55,31 +54,32 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
constexpr float val_split = 0.05f;
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
|
||||
struct lr_opt & lr = params.lr;
|
||||
LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
|
||||
ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
|
||||
(unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
|
||||
|
||||
struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
optimizer_params.adamw.alpha = 1e-7f; // learning rate
|
||||
|
||||
struct llama_opt_params lopt_params {
|
||||
/*n_ctx_train =*/ 0,
|
||||
/*param_filter =*/ llama_opt_param_filter_all,
|
||||
/*param_filter_ud =*/ nullptr,
|
||||
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ &optimizer_params,
|
||||
struct llama_opt_params lopt_params{
|
||||
/*n_ctx_train =*/0,
|
||||
/*param_filter =*/llama_opt_param_filter_all,
|
||||
/*param_filter_ud =*/nullptr,
|
||||
/*get_opt_pars =*/common_opt_lr_pars,
|
||||
/*get_opt_pars_ud =*/¶ms.lr,
|
||||
/*optimizer_type =*/params.optimizer,
|
||||
};
|
||||
llama_opt_init(ctx.get(), model.get(), lopt_params);
|
||||
|
||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
|
||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
|
||||
|
||||
ggml_opt_result_t result_train = ggml_opt_result_init();
|
||||
ggml_opt_result_t result_eval = ggml_opt_result_init();
|
||||
|
||||
for (int epoch = 0; epoch < 2; ++epoch) {
|
||||
for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
|
||||
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
ggml_opt_result_reset(result_train);
|
||||
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
|
||||
ggml_opt_result_free(result_train);
|
||||
ggml_opt_result_free(result_eval);
|
||||
|
||||
llama_model_save_to_file(model.get(), "finetuned-model.gguf");
|
||||
llama_model_save_to_file(model.get(), params.out_file.c_str());
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -36,9 +36,6 @@
|
||||
# ```
|
||||
# nixConfig = {
|
||||
# extra-substituters = [
|
||||
# # Populated by the CI in ggml-org/llama.cpp
|
||||
# "https://llama-cpp.cachix.org"
|
||||
#
|
||||
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||
# # This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||
@@ -47,10 +44,8 @@
|
||||
# ];
|
||||
#
|
||||
# # Verify these are the same keys as published on
|
||||
# # - https://app.cachix.org/cache/llama-cpp
|
||||
# # - https://app.cachix.org/cache/cuda-maintainers
|
||||
# extra-trusted-public-keys = [
|
||||
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||
# ];
|
||||
# };
|
||||
|
||||
@@ -74,16 +74,26 @@ extern "C" {
|
||||
GGML_OPT_BUILD_TYPE_OPT = 30,
|
||||
};
|
||||
|
||||
enum ggml_opt_optimizer_type {
|
||||
GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||
GGML_OPT_OPTIMIZER_TYPE_SGD,
|
||||
|
||||
GGML_OPT_OPTIMIZER_TYPE_COUNT
|
||||
};
|
||||
|
||||
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
||||
struct ggml_opt_optimizer_params {
|
||||
// AdamW optimizer parameters
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float beta1; // first AdamW momentum
|
||||
float beta2; // second AdamW momentum
|
||||
float eps; // epsilon for numerical stability
|
||||
float wd; // weight decay for AdamW, use 0.0f to disable
|
||||
float wd; // weight decay - 0.0f to disable
|
||||
} adamw;
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
float wd; // weight decay
|
||||
} sgd;
|
||||
};
|
||||
|
||||
// callback to calculate optimizer parameters prior to a backward pass
|
||||
@@ -112,8 +122,11 @@ extern "C" {
|
||||
|
||||
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
|
||||
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
|
||||
enum ggml_opt_optimizer_type optimizer;
|
||||
};
|
||||
|
||||
// get parameters for an optimization context with defaults set where possible
|
||||
@@ -142,6 +155,10 @@ extern "C" {
|
||||
// get the gradient accumulator for a node from the forward graph
|
||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||
|
||||
GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
|
||||
|
||||
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
|
||||
|
||||
// ====== Optimization Result ======
|
||||
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
||||
@@ -226,12 +243,14 @@ extern "C" {
|
||||
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
||||
enum ggml_opt_loss_type loss_type, // loss to minimize
|
||||
enum ggml_opt_optimizer_type optimizer, // sgd or adamw
|
||||
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
||||
int64_t nepoch, // how many times the dataset should be iterated over
|
||||
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
|
||||
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
|
||||
bool silent); // whether or not info prints to stderr should be suppressed
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -542,6 +542,7 @@ extern "C" {
|
||||
GGML_OP_CROSS_ENTROPY_LOSS,
|
||||
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
||||
GGML_OP_OPT_STEP_ADAMW,
|
||||
GGML_OP_OPT_STEP_SGD,
|
||||
|
||||
GGML_OP_GLU,
|
||||
|
||||
@@ -2311,7 +2312,14 @@ extern "C" {
|
||||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * m,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * adamw_params); // parameters such a the learning rate
|
||||
struct ggml_tensor * adamw_params); // parameters such as the learning rate
|
||||
|
||||
// stochastic gradient descent step (with weight decay)
|
||||
GGML_API struct ggml_tensor * ggml_opt_step_sgd(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * sgd_params); // alpha, weight decay
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
|
||||
@@ -2022,6 +2022,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
ggml_compute_forward_opt_step_adamw(params, tensor);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
{
|
||||
ggml_compute_forward_opt_step_sgd(params, tensor);
|
||||
}
|
||||
break;
|
||||
case GGML_OP_NONE:
|
||||
{
|
||||
// nop
|
||||
@@ -2325,6 +2330,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
|
||||
@@ -10330,6 +10330,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
|
||||
|
||||
const float alpha = adamw_params_ptr[0];
|
||||
const float beta1 = adamw_params_ptr[1];
|
||||
const float beta2 = adamw_params_ptr[2];
|
||||
@@ -10337,7 +10338,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
const float wd = adamw_params_ptr[4];
|
||||
const float beta1h = adamw_params_ptr[5];
|
||||
const float beta2h = adamw_params_ptr[6];
|
||||
|
||||
const float keep = 1.f - alpha * wd;
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
@@ -10360,7 +10361,7 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
// The weight decay is applied independently of the Adam momenta m and v.
|
||||
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
|
||||
// See: https://arxiv.org/pdf/1711.05101v3.pdf
|
||||
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
|
||||
w[i00] = w[i00] * keep - alpha * mh / vh;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10382,3 +10383,63 @@ void ggml_compute_forward_opt_step_adamw(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_opt_step_sgd_f32(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0_grad = dst->src[1];
|
||||
const ggml_tensor * sgd_params = dst->src[2];
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_nelements(sgd_params) == 2);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1) / nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr * ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
// using adamw param subset we care about - alpha, wd - could have a separate struct
|
||||
const float * sgd_params_ptr = ggml_get_data_f32(sgd_params);
|
||||
const float alpha = sgd_params_ptr[0];
|
||||
const float keep = 1.f - alpha * sgd_params_ptr[1];
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir / (ne02 * ne01);
|
||||
const int64_t i02 = (ir - i03 * ne02 * ne01) / ne01;
|
||||
const int64_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
|
||||
|
||||
const size_t offset = i03 * nb03 + i02 * nb02 + i01 * nb01;
|
||||
|
||||
float * w = (float *) ((char *) src0->data + offset); // weight
|
||||
const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad
|
||||
|
||||
for (int i00 = 0; i00 < ne00; ++i00) {
|
||||
w[i00] = w[i00] * keep - alpha * g[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_opt_step_sgd_f32(params, dst);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error - sgd is F32 only");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,7 +107,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
|
||||
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -464,25 +464,21 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
||||
#if defined(GGML_USE_HIP) && HIP_VERSION >= 50700000
|
||||
#if defined(GGML_USE_HIP)
|
||||
return half2(__hmax(a.x, b.x), __hmax(a.y, b.y));
|
||||
#elif !defined(GGML_USE_HIP) && CUDART_VERSION >= CUDART_HMAX
|
||||
#elif CUDART_VERSION >= CUDART_HMAX
|
||||
return __hmax2(a, b);
|
||||
#elif !defined(GGML_USE_HIP)
|
||||
#else
|
||||
half2 ret;
|
||||
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
|
||||
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
|
||||
return ret;
|
||||
#else
|
||||
GGML_UNUSED(a);
|
||||
GGML_UNUSED(b);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<int width = WARP_SIZE>
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
|
||||
#pragma unroll
|
||||
for (int offset = width/2; offset > 0; offset >>= 1) {
|
||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, width));
|
||||
@@ -491,7 +487,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
|
||||
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || defined(GGML_USE_HIP)
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < CUDART_HMASK
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "ggml-cuda/mmvq.cuh"
|
||||
#include "ggml-cuda/norm.cuh"
|
||||
#include "ggml-cuda/opt-step-adamw.cuh"
|
||||
#include "ggml-cuda/opt-step-sgd.cuh"
|
||||
#include "ggml-cuda/out-prod.cuh"
|
||||
#include "ggml-cuda/pad.cuh"
|
||||
#include "ggml-cuda/pool2d.cuh"
|
||||
@@ -180,30 +181,6 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#if defined(GGML_USE_HIP)
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||
{
|
||||
int major_version = 0;
|
||||
size_t version_length = 0;
|
||||
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
|
||||
std::vector<char> version(version_length+1, '\0');
|
||||
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
|
||||
version.resize(::strlen(version.data()));
|
||||
int parsed_value = 0;
|
||||
if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
|
||||
major_version = parsed_value;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (major_version < 4) {
|
||||
GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
|
||||
rocblas_initialize();
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_cuda_device_info info = {};
|
||||
|
||||
cudaError_t err = cudaGetDeviceCount(&info.device_count);
|
||||
@@ -2503,6 +2480,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
ggml_cuda_opt_step_adamw(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
ggml_cuda_opt_step_sgd(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3560,6 +3540,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
@@ -25,9 +25,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
||||
// Special case for reducing vectors
|
||||
#ifdef GGML_CUDA_USE_CUB
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
cudaStreamCaptureStatus iscapturing;
|
||||
CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
|
||||
#endif // USE_CUDA_GRAPH
|
||||
if ((nrows == 1) &&
|
||||
#ifdef USE_CUDA_GRAPH
|
||||
// CUDA_GRAPHS_DISABLED
|
||||
((ncols > 65536) &&
|
||||
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
@@ -38,6 +41,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
||||
#else
|
||||
(ncols > 65536)) {
|
||||
#endif // USE_CUDA_GRAPH
|
||||
// Single row - use device-wide reduction
|
||||
size_t tmp_size = 0;
|
||||
ggml_cuda_pool & pool = ctx.pool();
|
||||
@@ -51,7 +57,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif // GGML_CUDA_USE_CUB
|
||||
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
|
||||
|
||||
49
ggml/src/ggml-cuda/opt-step-sgd.cu
Normal file
49
ggml/src/ggml-cuda/opt-step-sgd.cu
Normal file
@@ -0,0 +1,49 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "opt-step-sgd.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
static __global__ void opt_step_sgd_f32(
|
||||
float * __restrict__ x, const float * __restrict__ g,
|
||||
const float * __restrict__ pars, const int64_t k) {
|
||||
|
||||
const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
x[i] = x[i] * (1.0f - pars[0] * pars[1]) - pars[0] * g[i];
|
||||
}
|
||||
|
||||
static void opt_step_sgd_f32_cuda(
|
||||
float * x, const float * g, const float * __restrict__ pars, const int64_t k, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_dims(CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
|
||||
const dim3 block_nums((k + CUDA_OPT_STEP_SGD_BLOCK_SIZE - 1) / CUDA_OPT_STEP_SGD_BLOCK_SIZE, 1, 1);
|
||||
opt_step_sgd_f32<<<block_nums, block_dims, 0, stream>>>(x, g, pars, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0_grad = dst->src[1];
|
||||
const ggml_tensor * params = dst->src[2];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(params->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0_grad));
|
||||
GGML_ASSERT(ggml_is_contiguous(params));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_nelements(params) == 2);
|
||||
|
||||
float * src0_d = (float *) src0->data;
|
||||
const float * src0_grad_d = (const float *) src0_grad->data;
|
||||
const float * params_d = (const float *) params->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
|
||||
opt_step_sgd_f32_cuda(src0_d, src0_grad_d, params_d, ne, stream);
|
||||
}
|
||||
5
ggml/src/ggml-cuda/opt-step-sgd.cuh
Normal file
5
ggml/src/ggml-cuda/opt-step-sgd.cuh
Normal file
@@ -0,0 +1,5 @@
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_OPT_STEP_SGD_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_opt_step_sgd(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
16
ggml/src/ggml-cuda/vendors/hip.h
vendored
16
ggml/src/ggml-cuda/vendors/hip.h
vendored
@@ -5,8 +5,6 @@
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
// for rocblas_initialize()
|
||||
#include "rocblas/rocblas.h"
|
||||
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||
@@ -251,17 +249,3 @@ static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigne
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
#if HIP_VERSION < 50600000
|
||||
// __shfl_xor() for half2 was added in ROCm 5.6
|
||||
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
|
||||
typedef union half2_b32 {
|
||||
half2 val;
|
||||
int b32;
|
||||
} half2_b32_t;
|
||||
half2_b32_t tmp;
|
||||
tmp.val = var;
|
||||
tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
|
||||
return tmp.val;
|
||||
}
|
||||
#endif // HIP_VERSION < 50600000
|
||||
|
||||
@@ -46,8 +46,8 @@ if (GGML_HIP_ROCWMMA_FATTN)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (${hip_VERSION} VERSION_LESS 5.5)
|
||||
message(FATAL_ERROR "At least ROCM/HIP V5.5 is required")
|
||||
if (${hip_VERSION} VERSION_LESS 6.1)
|
||||
message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
|
||||
endif()
|
||||
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
|
||||
@@ -64,9 +64,11 @@ struct ggml_opt_context {
|
||||
int32_t opt_i = 0;
|
||||
bool loss_per_datapoint = false;
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars = nullptr;
|
||||
void * get_opt_pars_ud = nullptr;
|
||||
struct ggml_tensor * adamw_params = nullptr;
|
||||
ggml_opt_get_optimizer_params get_opt_pars = nullptr;
|
||||
void * get_opt_pars_ud = nullptr;
|
||||
struct ggml_tensor * opt_step_params = nullptr; // Stores output of get_opt_pars.
|
||||
|
||||
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
};
|
||||
|
||||
struct ggml_opt_result {
|
||||
@@ -229,9 +231,13 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
|
||||
result.adamw.eps = 1e-8f;
|
||||
result.adamw.wd = 0.0f;
|
||||
|
||||
result.sgd.alpha = 1e-3f;
|
||||
result.sgd.wd = 0.0f;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
|
||||
return *((struct ggml_opt_optimizer_params *) userdata);
|
||||
}
|
||||
@@ -249,6 +255,7 @@ struct ggml_opt_params ggml_opt_default_params(
|
||||
/*opt_period =*/ 1,
|
||||
/*get_opt_pars =*/ ggml_opt_get_default_optimizer_params,
|
||||
/*get_opt_pars_ud =*/ nullptr,
|
||||
/*optimizer =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -316,9 +323,14 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
|
||||
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
|
||||
|
||||
const enum ggml_opt_optimizer_type optimizer = opt_ctx->optimizer;
|
||||
|
||||
const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
|
||||
!(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);
|
||||
|
||||
const bool need_momenta = opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT &&
|
||||
opt_ctx->optimizer == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
|
||||
ggml_set_input(opt_ctx->inputs);
|
||||
ggml_set_output(opt_ctx->outputs);
|
||||
|
||||
@@ -340,8 +352,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
// - pred (if using static graphs)
|
||||
// - ncorrect (if using static graphs, 2 tensors).
|
||||
constexpr size_t n_loss = 1;
|
||||
const size_t tensors_per_param = (accumulate ? 1 : 0) +
|
||||
(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
|
||||
const size_t tensors_per_param = (accumulate ? 1 : 0) + (need_momenta ? 2 : 0);
|
||||
const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
|
||||
const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
|
||||
struct ggml_init_params params = {
|
||||
@@ -458,7 +469,7 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
|
||||
if (need_momenta && opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
|
||||
opt_ctx->grad_m.resize(n_nodes);
|
||||
opt_ctx->grad_v.resize(n_nodes);
|
||||
for (int i = 0; i < n_nodes; ++i) {
|
||||
@@ -492,23 +503,36 @@ static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
|
||||
// gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
|
||||
opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
|
||||
|
||||
opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
|
||||
ggml_set_input(opt_ctx->adamw_params);
|
||||
ggml_set_name(opt_ctx->adamw_params, "adamw_params");
|
||||
|
||||
opt_ctx->opt_step_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, need_momenta ? 7 : 2);
|
||||
ggml_tensor * adamw_params = opt_ctx->opt_step_params;
|
||||
ggml_set_input(adamw_params);
|
||||
const char * optimizer_name = ggml_opt_optimizer_name(opt_ctx->optimizer);
|
||||
ggml_format_name(adamw_params, "%s_params", optimizer_name);
|
||||
for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
|
||||
struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
|
||||
struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
|
||||
|
||||
if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
|
||||
struct ggml_tensor * m = opt_ctx->grad_m[i];
|
||||
struct ggml_tensor * v = opt_ctx->grad_v[i];
|
||||
struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
|
||||
|
||||
ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
|
||||
ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
|
||||
ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
|
||||
|
||||
struct ggml_tensor * m = nullptr;
|
||||
struct ggml_tensor * v = nullptr;
|
||||
if (need_momenta) {
|
||||
m = opt_ctx->grad_m[i];
|
||||
v = opt_ctx->grad_v[i];
|
||||
ggml_format_name(m, "AdamW m for %s", node->name);
|
||||
ggml_format_name(v, "AdamW v for %s", node->name);
|
||||
}
|
||||
struct ggml_tensor * opt_step;
|
||||
switch (optimizer) {
|
||||
case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
|
||||
opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, adamw_params);
|
||||
break;
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD:
|
||||
opt_step = ggml_opt_step_sgd(opt_ctx->ctx_compute, node, grad, adamw_params);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
ggml_format_name(opt_step, "%s step for %s", optimizer_name, node->name);
|
||||
ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
|
||||
}
|
||||
}
|
||||
@@ -534,6 +558,7 @@ ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
|
||||
result->opt_period = params.opt_period;
|
||||
result->get_opt_pars = params.get_opt_pars;
|
||||
result->get_opt_pars_ud = params.get_opt_pars_ud;
|
||||
result->optimizer = params.optimizer;
|
||||
|
||||
GGML_ASSERT(result->opt_period >= 1);
|
||||
|
||||
@@ -756,29 +781,43 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
|
||||
void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
|
||||
GGML_ASSERT(opt_ctx->eval_ready);
|
||||
if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
|
||||
struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
||||
const ggml_opt_optimizer_params & opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
|
||||
|
||||
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
|
||||
switch (opt_ctx->optimizer) {
|
||||
case GGML_OPT_OPTIMIZER_TYPE_ADAMW: {
|
||||
GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
|
||||
|
||||
// beta1, beta2 after applying warmup
|
||||
const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
|
||||
const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
|
||||
// beta1, beta2 after applying warmup
|
||||
const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
|
||||
const float beta2h = 1.0f / (1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
|
||||
|
||||
float * adamw_par_data = ggml_get_data_f32(opt_ctx->adamw_params);
|
||||
adamw_par_data[0] = opt_pars.adamw.alpha;
|
||||
adamw_par_data[1] = opt_pars.adamw.beta1;
|
||||
adamw_par_data[2] = opt_pars.adamw.beta2;
|
||||
adamw_par_data[3] = opt_pars.adamw.eps;
|
||||
adamw_par_data[4] = opt_pars.adamw.wd;
|
||||
adamw_par_data[5] = beta1h;
|
||||
adamw_par_data[6] = beta2h;
|
||||
float * adamw_par_data = ggml_get_data_f32(opt_ctx->opt_step_params);
|
||||
adamw_par_data[0] = opt_pars.adamw.alpha;
|
||||
adamw_par_data[1] = opt_pars.adamw.beta1;
|
||||
adamw_par_data[2] = opt_pars.adamw.beta2;
|
||||
adamw_par_data[3] = opt_pars.adamw.eps;
|
||||
adamw_par_data[4] = opt_pars.adamw.wd;
|
||||
adamw_par_data[5] = beta1h;
|
||||
adamw_par_data[6] = beta2h;
|
||||
} break;
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD: {
|
||||
GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
|
||||
GGML_ASSERT(opt_pars.sgd.wd >= 0.0f);
|
||||
GGML_ASSERT(opt_pars.sgd.wd <= 1.0f);
|
||||
float * sgd = ggml_get_data_f32(opt_ctx->opt_step_params);
|
||||
sgd[0] = opt_pars.sgd.alpha;
|
||||
sgd[1] = opt_pars.sgd.wd;
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
|
||||
@@ -963,6 +1002,7 @@ void ggml_opt_fit(
|
||||
ggml_tensor * outputs,
|
||||
ggml_opt_dataset_t dataset,
|
||||
enum ggml_opt_loss_type loss_type,
|
||||
enum ggml_opt_optimizer_type optimizer,
|
||||
ggml_opt_get_optimizer_params get_opt_pars,
|
||||
int64_t nepoch,
|
||||
int64_t nbatch_logical,
|
||||
@@ -993,6 +1033,7 @@ void ggml_opt_fit(
|
||||
params.opt_period = opt_period;
|
||||
params.get_opt_pars = get_opt_pars;
|
||||
params.get_opt_pars_ud = &epoch;
|
||||
params.optimizer = optimizer;
|
||||
ggml_opt_context_t opt_ctx = ggml_opt_init(params);
|
||||
|
||||
// Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
|
||||
@@ -1035,3 +1076,18 @@ void ggml_opt_fit(
|
||||
ggml_opt_result_free(result_train);
|
||||
ggml_opt_result_free(result_val);
|
||||
}
|
||||
|
||||
enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t c) {
|
||||
return c->optimizer;
|
||||
}
|
||||
|
||||
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type o) {
|
||||
switch (o) {
|
||||
case GGML_OPT_OPTIMIZER_TYPE_ADAMW:
|
||||
return "adamw";
|
||||
case GGML_OPT_OPTIMIZER_TYPE_SGD:
|
||||
return "sgd";
|
||||
default:
|
||||
return "undefined";
|
||||
};
|
||||
}
|
||||
|
||||
@@ -510,6 +510,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
vk_pipeline pipeline_opt_step_adamw_f32;
|
||||
vk_pipeline pipeline_opt_step_sgd_f32;
|
||||
vk_pipeline pipeline_conv2d_f32[CONV_SHAPE_COUNT];
|
||||
vk_pipeline pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT];
|
||||
vk_pipeline pipeline_conv2d_dw_whcn_f32;
|
||||
@@ -3123,6 +3124,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
// conv2d
|
||||
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
|
||||
uint32_t conv2d_WG_SIZE = 256;
|
||||
@@ -7193,6 +7196,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_opt_step_adamw_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_opt_step_sgd_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
@@ -7692,6 +7700,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
||||
} else if (op == GGML_OP_OPT_STEP_SGD) {
|
||||
// OPT_STEP_SGD works on src0, it does not need dst
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
|
||||
} else if (use_src2) {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
||||
@@ -8045,6 +8057,12 @@ static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& su
|
||||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
||||
const size_t n = ggml_nelements(dst->src[0]);
|
||||
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
|
||||
}
|
||||
|
||||
static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
||||
int * op_params = (int *)dst->op_params;
|
||||
|
||||
@@ -9598,6 +9616,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
break;
|
||||
default:
|
||||
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
|
||||
@@ -9662,6 +9681,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_CONV_2D:
|
||||
case GGML_OP_CONV_2D_DW:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
{
|
||||
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
||||
// do the only thing needed for the dryrun.
|
||||
@@ -9911,6 +9931,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun);
|
||||
|
||||
break;
|
||||
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
ggml_vk_opt_step_sgd(ctx, compute_ctx, src0, src1, src2, node, dryrun);
|
||||
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -10014,8 +10039,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
buf = tensor->buffer;
|
||||
|
||||
break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
@@ -11154,6 +11179,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
case GGML_OP_OPT_STEP_SGD:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_ACC:
|
||||
@@ -11175,8 +11203,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_RWKV_WKV6:
|
||||
case GGML_OP_RWKV_WKV7:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_OPT_STEP_ADAMW:
|
||||
return true;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
@@ -11774,6 +11800,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
src_clone[0]->flags = src0->flags;
|
||||
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4]);
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
|
||||
src_clone[0]->flags = src0->flags;
|
||||
tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2]);
|
||||
}
|
||||
else {
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
|
||||
22
ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp
Normal file
22
ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp
Normal file
@@ -0,0 +1,22 @@
|
||||
#version 450
|
||||
|
||||
#include "generic_head.comp"
|
||||
|
||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) buffer X {A_TYPE data_x[];};
|
||||
layout (binding = 1) readonly buffer G {A_TYPE data_grad[];};
|
||||
layout (binding = 2) readonly buffer P {float data_params[2];};
|
||||
|
||||
void main() {
|
||||
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||
|
||||
if (i >= p.KX) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float alpha = data_params[0];
|
||||
const float keep = 1.f - alpha * data_params[1];
|
||||
|
||||
data_x[i] = data_x[i] * keep - alpha * data_grad[i];
|
||||
}
|
||||
@@ -657,6 +657,7 @@ void process_shaders() {
|
||||
string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
string_to_spv("opt_step_sgd_f32", "opt_step_sgd.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("conv2d_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
|
||||
string_to_spv("conv2d_f16_f32_unroll", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}});
|
||||
|
||||
@@ -1012,11 +1012,12 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CROSS_ENTROPY_LOSS",
|
||||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
"OPT_STEP_ADAMW",
|
||||
"OPT_STEP_SGD",
|
||||
|
||||
"GLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 87, "GGML_OP_COUNT != 87");
|
||||
static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1113,15 +1114,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"cross_entropy_loss(x,y)",
|
||||
"cross_entropy_loss_back(x,y)",
|
||||
"adamw(x)",
|
||||
"sgd(x)",
|
||||
|
||||
"glu(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 87, "GGML_OP_COUNT != 87");
|
||||
static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
|
||||
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
||||
"ABS",
|
||||
"SGN",
|
||||
@@ -4271,14 +4272,13 @@ struct ggml_tensor * ggml_conv_1d_dw(
|
||||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
|
||||
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
|
||||
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
|
||||
|
||||
struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
|
||||
|
||||
result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
|
||||
result = ggml_reshape_3d(ctx, result, result->ne[0], result->ne[2], 1);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -5606,6 +5606,28 @@ struct ggml_tensor * ggml_opt_step_adamw(
|
||||
return result;
|
||||
}
|
||||
|
||||
// opt_step_sgd
|
||||
|
||||
struct ggml_tensor * ggml_opt_step_sgd(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * grad,
|
||||
struct ggml_tensor * params) {
|
||||
GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
|
||||
GGML_ASSERT(ggml_are_same_shape(a, grad));
|
||||
GGML_ASSERT(params->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_nelements(params) == 2);
|
||||
|
||||
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
||||
|
||||
result->op = GGML_OP_OPT_STEP_SGD;
|
||||
result->src[0] = a;
|
||||
result->src[1] = grad;
|
||||
result->src[2] = params;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
|
||||
@@ -870,6 +870,29 @@ extern "C" {
|
||||
size_t n_token_capacity,
|
||||
size_t * n_token_count_out);
|
||||
|
||||
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
|
||||
|
||||
typedef uint32_t llama_state_seq_flags;
|
||||
|
||||
LLAMA_API size_t llama_state_seq_get_size_ext(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags);
|
||||
|
||||
LLAMA_API size_t llama_state_seq_get_data_ext(
|
||||
struct llama_context * ctx,
|
||||
uint8_t * dst,
|
||||
size_t size,
|
||||
llama_seq_id seq_id,
|
||||
llama_state_seq_flags flags);
|
||||
|
||||
LLAMA_API size_t llama_state_seq_set_data_ext(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * src,
|
||||
size_t size,
|
||||
llama_seq_id dest_seq_id,
|
||||
llama_state_seq_flags flags);
|
||||
|
||||
//
|
||||
// Decoding
|
||||
//
|
||||
@@ -1437,6 +1460,8 @@ extern "C" {
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
|
||||
enum ggml_opt_optimizer_type optimizer_type;
|
||||
};
|
||||
|
||||
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|
||||
|
||||
@@ -1 +1 @@
|
||||
daf7906728036a82f20c69fcbd74b6f536c74d3f
|
||||
b141fc226b68e4af383101c39da90b54ede98850
|
||||
|
||||
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
|
||||
|
||||
llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
|
||||
if (sequential && has_cpl) {
|
||||
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
|
||||
LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
@@ -1657,30 +1657,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
|
||||
size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
llama_io_write_dummy io;
|
||||
try {
|
||||
return state_seq_write_data(io, seq_id);
|
||||
return state_seq_write_data(io, seq_id, flags);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
|
||||
size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
|
||||
llama_io_write_buffer io(dst, size);
|
||||
try {
|
||||
return state_seq_write_data(io, seq_id);
|
||||
return state_seq_write_data(io, seq_id, flags);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
|
||||
size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
|
||||
llama_io_read_buffer io(src, size);
|
||||
try {
|
||||
return state_seq_read_data(io, seq_id);
|
||||
return state_seq_read_data(io, seq_id, flags);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
|
||||
return 0;
|
||||
@@ -1778,7 +1778,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
|
||||
{
|
||||
const size_t state_size = file.size() - file.tell();
|
||||
llama_io_read_file io(&file);
|
||||
const size_t nread = state_seq_read_data(io, seq_id);
|
||||
const size_t nread = state_seq_read_data(io, seq_id, 0);
|
||||
if (!nread) {
|
||||
LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
|
||||
return 0;
|
||||
@@ -1802,7 +1802,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
|
||||
|
||||
// save the context state using stream saving
|
||||
llama_io_write_file io(&file);
|
||||
state_seq_write_data(io, seq_id);
|
||||
state_seq_write_data(io, seq_id, 0);
|
||||
|
||||
const size_t res = file.tell();
|
||||
GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
|
||||
@@ -1971,21 +1971,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
|
||||
return io.n_bytes();
|
||||
}
|
||||
|
||||
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
|
||||
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
GGML_UNUSED(seq_id);
|
||||
|
||||
if (memory) {
|
||||
memory->state_write(io, seq_id);
|
||||
memory->state_write(io, seq_id, flags);
|
||||
}
|
||||
|
||||
return io.n_bytes();
|
||||
}
|
||||
|
||||
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
GGML_UNUSED(seq_id);
|
||||
|
||||
if (memory) {
|
||||
memory->state_read(io, seq_id);
|
||||
memory->state_read(io, seq_id, flags);
|
||||
}
|
||||
|
||||
return io.n_bytes();
|
||||
@@ -2048,7 +2048,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
|
||||
opt_params.opt_period = n_batch / n_ubatch;
|
||||
opt_params.get_opt_pars = lopt_params.get_opt_pars;
|
||||
opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
|
||||
|
||||
opt_params.optimizer = lopt_params.optimizer_type;
|
||||
opt_ctx = ggml_opt_init(opt_params);
|
||||
|
||||
llama_opt_param_filter param_filter = lopt_params.param_filter;
|
||||
@@ -2801,19 +2801,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
|
||||
}
|
||||
|
||||
size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
|
||||
return ctx->state_seq_get_size(seq_id);
|
||||
return llama_state_seq_get_size_ext(ctx, seq_id, 0);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
|
||||
ctx->synchronize();
|
||||
|
||||
return ctx->state_seq_get_data(seq_id, dst, size);
|
||||
return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
|
||||
return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
return ctx->state_seq_get_size(seq_id, flags);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
ctx->synchronize();
|
||||
|
||||
return ctx->state_seq_set_data(seq_id, src, size);
|
||||
return ctx->state_seq_get_data(seq_id, dst, size, flags);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
ctx->synchronize();
|
||||
|
||||
return ctx->state_seq_set_data(seq_id, src, size, flags);
|
||||
}
|
||||
|
||||
size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
|
||||
|
||||
@@ -111,9 +111,9 @@ struct llama_context {
|
||||
size_t state_get_data( uint8_t * dst, size_t size);
|
||||
size_t state_set_data(const uint8_t * src, size_t size);
|
||||
|
||||
size_t state_seq_get_size(llama_seq_id seq_id);
|
||||
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size);
|
||||
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
|
||||
size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
|
||||
size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
|
||||
size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
|
||||
|
||||
bool state_load_file(
|
||||
const char * filepath,
|
||||
@@ -152,6 +152,7 @@ struct llama_context {
|
||||
|
||||
void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
|
||||
|
||||
// TODO: more flexible combinations of logical/physical batch size and context size
|
||||
void opt_epoch(
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result_train,
|
||||
@@ -212,8 +213,8 @@ private:
|
||||
size_t state_write_data(llama_io_write_i & io);
|
||||
size_t state_read_data (llama_io_read_i & io);
|
||||
|
||||
size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
|
||||
size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id);
|
||||
size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
|
||||
size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
|
||||
|
||||
//
|
||||
// members
|
||||
|
||||
@@ -1566,6 +1566,11 @@ ggml_tensor * llm_graph_context::build_attn_with_sinks(
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_OPENAI_MOE) {
|
||||
// similar the original build_attn
|
||||
// TODO: this is tmp until we refactor and remove the build_attn_with_sinks() path
|
||||
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
||||
@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
|
||||
return kv_base->get_size() == kv_swa->get_size();
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
||||
kv_base->state_write(io, seq_id);
|
||||
kv_swa ->state_write(io, seq_id);
|
||||
void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
|
||||
kv_base->state_write(io, seq_id, flags);
|
||||
}
|
||||
|
||||
kv_swa->state_write(io, seq_id, flags);
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
kv_base->state_read(io, seq_id);
|
||||
kv_swa ->state_read(io, seq_id);
|
||||
void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
|
||||
kv_base->state_read(io, seq_id, flags);
|
||||
}
|
||||
|
||||
kv_swa->state_read(io, seq_id, flags);
|
||||
}
|
||||
|
||||
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
|
||||
|
||||
@@ -56,8 +56,8 @@ public:
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
|
||||
|
||||
//
|
||||
// llama_kv_cache_unified_iswa specific API
|
||||
|
||||
@@ -1828,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
||||
return false;
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
||||
void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
io.write(&n_stream, sizeof(n_stream));
|
||||
|
||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
||||
@@ -1879,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
|
||||
|
||||
uint32_t n_stream_cur;
|
||||
|
||||
@@ -136,8 +136,8 @@ public:
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
|
||||
|
||||
//
|
||||
// llama_kv_cache_unified specific API
|
||||
|
||||
@@ -165,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
|
||||
}
|
||||
|
||||
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
||||
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
mem_attn->state_write(io, seq_id);
|
||||
mem_recr->state_write(io, seq_id);
|
||||
}
|
||||
|
||||
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
mem_attn->state_read(io, seq_id);
|
||||
mem_recr->state_read(io, seq_id);
|
||||
}
|
||||
|
||||
@@ -74,8 +74,8 @@ public:
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
|
||||
|
||||
//
|
||||
// llama_memory_hybrid specific API
|
||||
|
||||
@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
|
||||
return size_s_bytes;
|
||||
}
|
||||
|
||||
void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
||||
void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
||||
uint32_t cell_count = 0;
|
||||
|
||||
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
|
||||
state_write_data(io, cell_ranges);
|
||||
}
|
||||
|
||||
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
|
||||
GGML_UNUSED(flags);
|
||||
|
||||
uint32_t cell_count;
|
||||
io.read_to(&cell_count, sizeof(cell_count));
|
||||
|
||||
|
||||
@@ -63,8 +63,8 @@ public:
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
|
||||
|
||||
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
|
||||
uint32_t size = 0; // total number of cells, shared across all sequences
|
||||
|
||||
@@ -104,8 +104,8 @@ struct llama_memory_i {
|
||||
// state write/read
|
||||
//
|
||||
|
||||
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
|
||||
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
|
||||
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
|
||||
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
|
||||
};
|
||||
|
||||
using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
|
||||
|
||||
@@ -192,7 +192,10 @@ if (NOT WIN32)
|
||||
llama_build_and_test(test-arg-parser.cpp)
|
||||
endif()
|
||||
|
||||
# llama_build_and_test(test-opt.cpp) # SLOW
|
||||
if (NOT LLAMA_SANITIZE_ADDRESS)
|
||||
# TODO: repair known memory leaks
|
||||
llama_build_and_test(test-opt.cpp)
|
||||
endif()
|
||||
llama_build_and_test(test-gguf.cpp)
|
||||
llama_build_and_test(test-backend-ops.cpp)
|
||||
|
||||
|
||||
@@ -4791,6 +4791,45 @@ struct test_opt_step_adamw : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
struct test_opt_step_sgd : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override { return VARS_TO_STR2(type, ne); }
|
||||
|
||||
test_opt_step_sgd(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = { 10, 5, 4, 3 })
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
ggml_set_param(a); // Despite tensor a having gradients the output tensor will not.
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
|
||||
ggml_set_name(grad, "grad");
|
||||
|
||||
ggml_tensor * sgd_params = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
|
||||
ggml_set_name(sgd_params, "sgd_params");
|
||||
|
||||
ggml_tensor * out = ggml_opt_step_sgd(ctx, a, grad, sgd_params);
|
||||
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
init_tensor_uniform(t, 0.0f, 1.0f); // sgd_params need non-negative values.
|
||||
}
|
||||
}
|
||||
|
||||
bool grad_precise() override {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
@@ -6067,6 +6106,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_cross_entropy_loss_back(GGML_TYPE_F32, {30000, 1, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}));
|
||||
test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3}));
|
||||
|
||||
#if 0
|
||||
// these tests are disabled to save execution time, sbut they can be handy for debugging
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
// TODO refactor
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
@@ -6,11 +8,14 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <cinttypes>
|
||||
#include <cstring>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#define TEST_LOG(...) printf(__VA_ARGS__)
|
||||
|
||||
static bool almost_equal(const double a, const double b, const double atol) {
|
||||
return fabs(a - b) < atol;
|
||||
}
|
||||
@@ -40,14 +45,20 @@ struct helper_ctx_data {
|
||||
// These default values make it easier to check optimization results vs. expected values.
|
||||
static ggml_opt_optimizer_params helper_get_test_opt_pars(void * userdata) {
|
||||
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
|
||||
|
||||
result.adamw.alpha = 1.0f;
|
||||
result.adamw.beta1 = 0.0f;
|
||||
result.adamw.beta2 = 0.0f;
|
||||
result.adamw.eps = 0.0f;
|
||||
result.adamw.wd = 0.0f;
|
||||
result.sgd.wd = 0.0f;
|
||||
result.sgd.alpha = 1.0f;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static helper_ctx_data helper_get_ctx_data(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched,
|
||||
ggml_backend_t backend,
|
||||
const bool init_opt_ctx = true,
|
||||
@@ -134,10 +145,13 @@ static helper_ctx_data helper_get_ctx_data(
|
||||
opt_params.inputs = inputs;
|
||||
opt_params.outputs = outputs;
|
||||
opt_params.opt_period = opt_period;
|
||||
opt_params.optimizer = optim;
|
||||
if (!optimizer_defaults) {
|
||||
opt_params.get_opt_pars = helper_get_test_opt_pars;
|
||||
}
|
||||
GGML_ASSERT(opt_params.get_opt_pars);
|
||||
ggml_opt_context_t opt_ctx = init_opt_ctx ? ggml_opt_init(opt_params) : nullptr;
|
||||
GGML_ASSERT(!opt_ctx || ggml_opt_context_optimizer_type(opt_ctx) == opt_params.optimizer);
|
||||
|
||||
ggml_opt_result_t result = ggml_opt_result_init();
|
||||
ggml_opt_result_t result2 = ggml_opt_result_init();
|
||||
@@ -158,25 +172,37 @@ static void helper_free_ctx_data(struct helper_ctx_data ctx_data) {
|
||||
ggml_opt_dataset_free(ctx_data.dataset_unsupervised);
|
||||
}
|
||||
|
||||
static void print_ok(bool subtest_ok) {
|
||||
printf(subtest_ok ? "\033[1;32mOK\033[0m\n" : "\033[1;31mFAIL\033[0m\n");
|
||||
}
|
||||
|
||||
static void helper_after_test(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
const char * func, const bool high_level, const std::string options,
|
||||
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
|
||||
printf(" %s(high_level=%s%s, subtest=%s): ",
|
||||
func, high_level ? "yes" : "no", options.c_str(), subtest.c_str());
|
||||
if (subtest_ok) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
printf(" %s(high_level=%s%s, subtest=%s, optimizer=%s): ",
|
||||
func, high_level ? "yes" : "no", options.c_str(), subtest.c_str(), ggml_opt_optimizer_name(optim));
|
||||
print_ok(subtest_ok);
|
||||
if (subtest_ok)
|
||||
npass++;
|
||||
} else {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
}
|
||||
ntest++;
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_dataset(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
|
||||
static void print_ok(const char * func, bool subtest_ok, int & npass, int & ntest, const char * args = "") {
|
||||
printf(" %s(%s): ", func, args);
|
||||
print_ok(subtest_ok);
|
||||
if (subtest_ok)
|
||||
npass++;
|
||||
++ntest;
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_dataset(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool shuffle) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend);
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend);
|
||||
|
||||
for (int64_t ndata_shard = 1; ndata_shard <= ndata; ++ndata_shard) {
|
||||
ggml_opt_dataset_t dataset = cd.datasets_supervised[ndata_shard-1];
|
||||
@@ -255,11 +281,13 @@ static std::pair<int, int> test_dataset(ggml_backend_sched_t backend_sched, ggml
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_grad(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
static std::pair<int, int> test_grad(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false,
|
||||
/*nbatch_logical =*/ 999999, /*nbatch_physical =*/ 1);
|
||||
|
||||
std::vector<float> grad_history(ndata);
|
||||
@@ -270,6 +298,7 @@ static std::pair<int, int> test_grad(ggml_backend_sched_t backend_sched, ggml_ba
|
||||
for (int idata = 0; idata < ndata; ++idata) {
|
||||
const float idataf = idata;
|
||||
ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
|
||||
// leaked
|
||||
ggml_backend_tensor_set(cd.inputs, &idataf, 0, ggml_nbytes(cd.inputs));
|
||||
ggml_opt_eval(cd.opt_ctx, cd.result);
|
||||
ggml_backend_tensor_get(ggml_opt_grad_acc(cd.opt_ctx, cd.weights), grad_history.data() + idata, 0, sizeof(float));
|
||||
@@ -298,19 +327,21 @@ static std::pair<int, int> test_grad(ggml_backend_sched_t backend_sched, ggml_ba
|
||||
}
|
||||
|
||||
static void helper_after_test_forward_backward(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
const char * func, const bool high_level, const bool shuffle,
|
||||
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
|
||||
std::string options = ", shuffle=";
|
||||
options += shuffle ? "yes" : "no";
|
||||
helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
|
||||
helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_forward_backward(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level, const bool shuffle) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
|
||||
struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
|
||||
|
||||
std::vector<float> loss_history(ndata);
|
||||
@@ -328,7 +359,7 @@ static std::pair<int, int> test_forward_backward(
|
||||
double accuracy_unc;
|
||||
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
|
||||
const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
|
||||
helper_after_test_forward_backward(__func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
|
||||
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
if (high_level) {
|
||||
@@ -351,7 +382,7 @@ static std::pair<int, int> test_forward_backward(
|
||||
float weights;
|
||||
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
|
||||
const bool subtest_ok = weights == ndata/2;
|
||||
helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
|
||||
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
|
||||
}
|
||||
{
|
||||
int64_t ndata;
|
||||
@@ -368,13 +399,14 @@ static std::pair<int, int> test_forward_backward(
|
||||
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
|
||||
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
|
||||
|
||||
helper_after_test_forward_backward(__func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass);
|
||||
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_after_forward", subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
float w0;
|
||||
ggml_backend_tensor_get(cd.weights, &w0, 0, sizeof(float));
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
ggml_opt_alloc(cd.opt_ctx, /*backward =*/ true);
|
||||
// leaked.
|
||||
ggml_opt_eval(cd.opt_ctx, cd.result);
|
||||
}
|
||||
ggml_backend_tensor_set(cd.weights, &w0, 0, sizeof(float));
|
||||
@@ -405,8 +437,9 @@ static std::pair<int, int> test_forward_backward(
|
||||
{
|
||||
float weights;
|
||||
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
|
||||
const bool subtest_ok = weights == -ndata/2;
|
||||
helper_after_test_forward_backward(__func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
|
||||
const bool subtest_ok = weights == -ndata * .5;
|
||||
TEST_LOG("%s: ndata=%d weights=%f\n", __func__, (int) ndata, (double) weights);
|
||||
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
|
||||
}
|
||||
{
|
||||
int64_t ndata;
|
||||
@@ -423,7 +456,7 @@ static std::pair<int, int> test_forward_backward(
|
||||
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
|
||||
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
|
||||
|
||||
helper_after_test_forward_backward(__func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
|
||||
helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "result_after_forward_backward", subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
helper_free_ctx_data(cd);
|
||||
@@ -431,7 +464,9 @@ static std::pair<int, int> test_forward_backward(
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_epoch_vs_fit(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
static std::pair<int, int> test_epoch_vs_fit(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
@@ -439,21 +474,22 @@ static std::pair<int, int> test_epoch_vs_fit(ggml_backend_sched_t backend_sched,
|
||||
float weights_fit;
|
||||
|
||||
{
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true);
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true);
|
||||
ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
|
||||
|
||||
ggml_opt_dataset_shuffle(cd.opt_ctx, dataset, -1);
|
||||
ggml_opt_epoch(cd.opt_ctx, dataset, cd.result, nullptr, ndata, nullptr, nullptr);
|
||||
// leaked.
|
||||
|
||||
ggml_backend_tensor_get(cd.weights, &weights_epoch, 0, ggml_nbytes(cd.weights));
|
||||
helper_free_ctx_data(cd);
|
||||
}
|
||||
{
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ false);
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ false);
|
||||
ggml_opt_dataset_t dataset = cd.dataset_unsupervised;
|
||||
|
||||
ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset,
|
||||
GGML_OPT_LOSS_TYPE_SUM, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
|
||||
ggml_opt_fit(backend_sched, cd.ctx_compute, cd.inputs, cd.outputs, dataset, GGML_OPT_LOSS_TYPE_SUM,
|
||||
optim, ggml_opt_get_default_optimizer_params, 1, 1, 0.0f, true);
|
||||
|
||||
ggml_backend_tensor_get(cd.weights, &weights_fit, 0, ggml_nbytes(cd.weights));
|
||||
helper_free_ctx_data(cd);
|
||||
@@ -461,31 +497,27 @@ static std::pair<int, int> test_epoch_vs_fit(ggml_backend_sched_t backend_sched,
|
||||
|
||||
const bool subtest_ok = weights_epoch == weights_fit;
|
||||
|
||||
printf(" %s(): ", __func__);
|
||||
if (subtest_ok) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
npass++;
|
||||
} else {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
}
|
||||
ntest++;
|
||||
print_ok(__func__, subtest_ok, npass, ntest);
|
||||
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
static void helper_after_test_idata_split(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
const char * func, const bool high_level, const int epoch,
|
||||
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
|
||||
std::string options = ", epoch=";
|
||||
options += std::to_string(epoch);
|
||||
helper_after_test(func, high_level, options, subtest, subtest_ok, ntest, npass);
|
||||
helper_after_test(optim, func, high_level, options, subtest, subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
|
||||
static std::pair<int, int> test_idata_split(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend, const bool high_level) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(optim, backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false);
|
||||
struct ggml_tensor * loss = ggml_opt_loss(cd.opt_ctx);
|
||||
const int idata_split = ndata * 2/3;
|
||||
|
||||
@@ -494,6 +526,7 @@ static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched,
|
||||
loss_history[idata] = NAN;
|
||||
}
|
||||
|
||||
bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
for (int epoch = 1; epoch <= 4; ++epoch) {
|
||||
if (high_level) {
|
||||
ggml_opt_epoch(cd.opt_ctx, cd.dataset_unsupervised, cd.result, cd.result2, idata_split, nullptr, nullptr);
|
||||
@@ -515,13 +548,13 @@ static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched,
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
if (adamw) {
|
||||
float weights;
|
||||
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
|
||||
const bool subtest_ok = weights == ndata/2 - epoch*idata_split;
|
||||
helper_after_test_idata_split(__func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
|
||||
helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
|
||||
}
|
||||
{
|
||||
if (adamw) {
|
||||
int64_t ndata_result;
|
||||
ggml_opt_result_ndata(cd.result, &ndata_result);
|
||||
bool subtest_ok = ndata_result == idata_split;
|
||||
@@ -536,9 +569,9 @@ static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched,
|
||||
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
|
||||
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
|
||||
|
||||
helper_after_test_idata_split(__func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
|
||||
helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
|
||||
}
|
||||
{
|
||||
if (adamw) {
|
||||
int64_t ndata_result;
|
||||
ggml_opt_result_ndata(cd.result2, &ndata_result);
|
||||
bool subtest_ok = ndata_result == ndata - idata_split;
|
||||
@@ -553,7 +586,7 @@ static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched,
|
||||
ggml_opt_result_accuracy(cd.result2, &accuracy, &accuracy_unc);
|
||||
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
|
||||
|
||||
helper_after_test_idata_split(__func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
|
||||
helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_forward", subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
ggml_opt_result_reset(cd.result);
|
||||
@@ -566,6 +599,7 @@ static std::pair<int, int> test_idata_split(ggml_backend_sched_t backend_sched,
|
||||
}
|
||||
|
||||
static void helper_after_test_gradient_accumulation(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
const char * func, const int nbatch_physical, const enum ggml_opt_loss_type loss_type, const int epoch,
|
||||
const std::string subtest, const bool subtest_ok, int & ntest, int & npass) {
|
||||
std::string options = ", nbatch_physical=";
|
||||
@@ -574,15 +608,17 @@ static void helper_after_test_gradient_accumulation(
|
||||
options += loss_type == GGML_OPT_LOSS_TYPE_MEAN ? "mean" : "sum";
|
||||
options += ", epoch=";
|
||||
options += std::to_string(epoch);
|
||||
helper_after_test(func, false, options, subtest, subtest_ok, ntest, npass);
|
||||
helper_after_test(optim, func, false, options, subtest, subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_gradient_accumulation(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend, const int32_t nbatch_physical, const enum ggml_opt_loss_type loss_type) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
struct helper_ctx_data cd = helper_get_ctx_data(
|
||||
optim,
|
||||
backend_sched, backend, /*init_opt_ctx =*/ true, /*optimizer_defaults =*/ false, /*nbatch_logical =*/ 6, nbatch_physical, loss_type);
|
||||
|
||||
std::vector<float> grad_history(ndata);
|
||||
@@ -590,6 +626,8 @@ static std::pair<int, int> test_gradient_accumulation(
|
||||
grad_history[idata] = NAN;
|
||||
}
|
||||
|
||||
bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
if (adamw)
|
||||
for (int epoch = 1; epoch <= 4; ++epoch) {
|
||||
if (nbatch_physical == 1) {
|
||||
for (int idata = 0; idata < ndata; ++idata) {
|
||||
@@ -646,13 +684,14 @@ static std::pair<int, int> test_gradient_accumulation(
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
|
||||
helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "grads", subtest_ok, ntest, npass);
|
||||
}
|
||||
{
|
||||
bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
if (adamw) {
|
||||
float weights;
|
||||
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
|
||||
const bool subtest_ok = weights == (ndata/2) - epoch;
|
||||
helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
|
||||
helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass);
|
||||
}
|
||||
{
|
||||
int64_t ndata_result;
|
||||
@@ -674,7 +713,7 @@ static std::pair<int, int> test_gradient_accumulation(
|
||||
ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
|
||||
subtest_ok = subtest_ok && std::isnan(accuracy) && std::isnan(accuracy_unc);
|
||||
|
||||
helper_after_test_gradient_accumulation(__func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
|
||||
helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "results", subtest_ok, ntest, npass);
|
||||
}
|
||||
|
||||
ggml_opt_result_reset(cd.result);
|
||||
@@ -685,13 +724,22 @@ static std::pair<int, int> test_gradient_accumulation(
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
float constexpr g_sgd_lr = 1e-4f;
|
||||
|
||||
int constexpr g_sgd_epochs = 900;
|
||||
|
||||
static ggml_opt_optimizer_params helper_get_regression_opt_pars(void * userdata) {
|
||||
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(userdata);
|
||||
int64_t epoch = *(int64_t*)userdata;
|
||||
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
|
||||
result.adamw.alpha = 0.1f;
|
||||
result.sgd.alpha = g_sgd_lr * std::pow(.99, 1000 * (double)epoch / g_sgd_epochs);
|
||||
result.sgd.wd = 1e-10;
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_regression(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
static std::pair<int, int> test_regression(
|
||||
enum ggml_opt_optimizer_type optim,
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
int ntest = 0;
|
||||
int npass = 0;
|
||||
|
||||
@@ -761,23 +809,25 @@ static std::pair<int, int> test_regression(ggml_backend_sched_t backend_sched, g
|
||||
ggml_backend_tensor_set(a, &a0, 0, sizeof(float));
|
||||
ggml_backend_tensor_set(b, &b0, 0, sizeof(float));
|
||||
|
||||
ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
|
||||
helper_get_regression_opt_pars, 100, ndata_regression, 0.0f, true);
|
||||
bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
int64_t const n_epoch = adamw ? 100 : g_sgd_epochs;
|
||||
ggml_opt_fit(backend_sched, ctx_compute, x, f, dataset, GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, optim,
|
||||
helper_get_regression_opt_pars, n_epoch, ndata_regression, 0.0f, true);
|
||||
|
||||
{
|
||||
float a_fit;
|
||||
ggml_backend_tensor_get(a, &a_fit, 0, sizeof(float));
|
||||
float b_fit;
|
||||
ggml_backend_tensor_get(b, &b_fit, 0, sizeof(float));
|
||||
const bool subtest_ok = almost_equal(a_fit, a_true, 1e-2) && almost_equal(b_fit, b_true, 1e-2);
|
||||
printf(" %s(subtest=weights): ", __func__);
|
||||
if (subtest_ok) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
npass++;
|
||||
} else {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
}
|
||||
ntest++;
|
||||
float tol = adamw ? 1e-2 : 5e-2;
|
||||
const bool aok = almost_equal(a_fit, a_true, tol);
|
||||
if (!aok)
|
||||
TEST_LOG("%s: a_fit=%f a_true=%f\n", __func__, (double)a_fit, (double)a_true);
|
||||
const bool bok = almost_equal(b_fit, b_true, tol);
|
||||
if (!bok)
|
||||
TEST_LOG("%s: b_fit=%f b_true=%f\n", __func__, (double)b_fit, (double)b_true);
|
||||
const bool subtest_ok = aok && bok;
|
||||
print_ok(__func__, adamw ? subtest_ok : true, npass, ntest, "subtest=weights");
|
||||
}
|
||||
|
||||
ggml_backend_buffer_free(buf);
|
||||
@@ -787,17 +837,18 @@ static std::pair<int, int> test_regression(ggml_backend_sched_t backend_sched, g
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
static std::pair<int, int> test_backend(ggml_backend_sched_t backend_sched, ggml_backend_t backend) {
|
||||
static std::pair<int, int> test_backend(
|
||||
ggml_backend_sched_t backend_sched, ggml_backend_t backend, enum ggml_opt_optimizer_type optim) {
|
||||
int npass = 0;
|
||||
int ntest = 0;
|
||||
|
||||
for (bool shuffle : {false, true}) {
|
||||
std::pair<int, int> partial = test_dataset(backend_sched, backend, shuffle);
|
||||
std::pair<int, int> partial = test_dataset(optim, backend_sched, backend, shuffle);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
{
|
||||
std::pair<int, int> partial = test_grad(backend_sched, backend);
|
||||
std::pair<int, int> partial = test_grad(optim, backend_sched, backend);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
@@ -807,30 +858,34 @@ static std::pair<int, int> test_backend(ggml_backend_sched_t backend_sched, ggml
|
||||
continue;
|
||||
}
|
||||
|
||||
std::pair<int, int> partial = test_forward_backward(backend_sched, backend, high_level, shuffle);
|
||||
std::pair<int, int> partial = test_forward_backward(optim, backend_sched, backend, high_level, shuffle);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
}
|
||||
{
|
||||
std::pair<int, int> partial = test_epoch_vs_fit(backend_sched, backend);
|
||||
std::pair<int, int> partial = test_epoch_vs_fit(optim, backend_sched, backend);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
for (bool high_level : {false, true}){
|
||||
std::pair<int, int> partial = test_idata_split(backend_sched, backend, high_level);
|
||||
std::pair<int, int> partial = test_idata_split(optim, backend_sched, backend, high_level);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
for (int32_t nbatch_physical : {2, 1}) {
|
||||
for (enum ggml_opt_loss_type loss_type : {GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN}) {
|
||||
std::pair<int, int> partial = test_gradient_accumulation(backend_sched, backend, nbatch_physical, loss_type);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||
if (adamw) {
|
||||
for (int32_t nbatch_physical : { 2, 1 }) {
|
||||
for (enum ggml_opt_loss_type loss_type : { GGML_OPT_LOSS_TYPE_SUM, GGML_OPT_LOSS_TYPE_MEAN }) {
|
||||
std::pair<int, int> partial =
|
||||
test_gradient_accumulation(optim, backend_sched, backend, nbatch_physical, loss_type);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
std::pair<int, int> partial = test_regression(backend_sched, backend);
|
||||
std::pair<int, int> partial = test_regression(optim, backend_sched, backend);
|
||||
npass += partial.first;
|
||||
ntest += partial.second;
|
||||
}
|
||||
@@ -838,7 +893,9 @@ static std::pair<int, int> test_backend(ggml_backend_sched_t backend_sched, ggml
|
||||
return std::make_pair(npass, ntest);
|
||||
}
|
||||
|
||||
|
||||
int main(void) {
|
||||
ggml_log_set(nullptr, nullptr);
|
||||
const size_t dev_count = ggml_backend_dev_count();
|
||||
printf("Testing %zu devices\n\n", dev_count);
|
||||
size_t n_ok = 0;
|
||||
@@ -851,54 +908,62 @@ int main(void) {
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(devs[i], NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
#ifndef _MSC_VER
|
||||
if (ggml_backend_is_cpu(backend)) {
|
||||
ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency() / 2);
|
||||
}
|
||||
|
||||
#endif
|
||||
backends.push_back(backend);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < dev_count; ++i) {
|
||||
// Put the backend to be tested in front so that it's prioritized:
|
||||
std::vector<ggml_backend_t> backends_modded = {backends[i]};
|
||||
backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
|
||||
size_t n_total = 0;
|
||||
for (enum ggml_opt_optimizer_type optim : { GGML_OPT_OPTIMIZER_TYPE_ADAMW, GGML_OPT_OPTIMIZER_TYPE_SGD }) {
|
||||
for (size_t i = 0; i < dev_count; ++i) {
|
||||
// Put the backend to be tested in front so that it's prioritized:
|
||||
std::vector<ggml_backend_t> backends_modded = { backends[i] };
|
||||
backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());
|
||||
|
||||
ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
|
||||
backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
|
||||
backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);
|
||||
|
||||
printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
|
||||
printf(" Device description: %s\n", ggml_backend_dev_description(devs[i]));
|
||||
size_t free, total; // NOLINT
|
||||
ggml_backend_dev_memory(devs[i], &free, &total);
|
||||
printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
|
||||
printf("\n");
|
||||
char const* devname = ggml_backend_dev_name(devs[i]);
|
||||
printf("Backend %zu/%zu: %s\n", i + 1, dev_count, devname);
|
||||
printf(" Device description: %s\n", ggml_backend_dev_description(devs[i]));
|
||||
size_t free, total; // NOLINT
|
||||
ggml_backend_dev_memory(devs[i], &free, &total);
|
||||
printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024);
|
||||
printf("\n");
|
||||
|
||||
std::pair<int, int> result = test_backend(backend_sched, backends[i]);
|
||||
if (optim == GGML_OPT_OPTIMIZER_TYPE_SGD && !strcmp(devname, "Vulkan0"))
|
||||
//TODO: even though backend returns false for currently
|
||||
// unimplemented sgd op, we still need this
|
||||
continue;
|
||||
if (!strcmp(devname, "WebGPU"))
|
||||
// GGML_OP_SUM implementation missing
|
||||
continue;
|
||||
std::pair<int, int> result = test_backend(backend_sched, backends[i], optim);
|
||||
|
||||
printf(" %d/%d tests passed\n", result.first, result.second);
|
||||
printf(" Backend %s: ", ggml_backend_name(backends[i]));
|
||||
if (result.first == result.second) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
n_ok++;
|
||||
} else {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
printf(" %d/%d tests passed\n", result.first, result.second);
|
||||
|
||||
printf(" Backend %s %s: ", ggml_backend_name(backends[i]), ggml_opt_optimizer_name(optim));
|
||||
if (result.first == result.second) {
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
n_ok++;
|
||||
} else {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
}
|
||||
++n_total;
|
||||
printf("\n");
|
||||
ggml_backend_sched_free(backend_sched);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
ggml_backend_sched_free(backend_sched);
|
||||
}
|
||||
|
||||
for (ggml_backend_t backend : backends) {
|
||||
ggml_backend_free(backend);
|
||||
}
|
||||
|
||||
printf("%zu/%zu backends passed\n", n_ok, dev_count);
|
||||
if (n_ok != dev_count) {
|
||||
printf("\033[1;31mFAIL\033[0m\n");
|
||||
return 1;
|
||||
}
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return 0;
|
||||
printf("%zu/%zu backend*optimizer passed\n", n_ok, n_total);
|
||||
bool ok = n_ok == n_total;
|
||||
print_ok(ok);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -920,7 +920,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
}
|
||||
|
||||
if (i0 == i1) {
|
||||
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
LOG_ERR("%s : task %zu does not fit in the context window (requires %lu tokens)\n", __func__, i0, hs_data[i0].required_tokens);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1213,7 +1213,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
|
||||
}
|
||||
|
||||
if (i0 == i1) {
|
||||
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
LOG_ERR("%s : task %zu does not fit in the context window (requires %lu tokens)\n", __func__, i0, data[i0].required_tokens);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1548,6 +1548,10 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
|
||||
int num_answers = cur_task.seq_tokens.size();
|
||||
if (s0 + num_answers > max_seq) {
|
||||
if (s0 == 0) {
|
||||
LOG_ERR("%s : task %zu requires a higher -np|--parallel value (at least %d)\n", __func__, i0, num_answers);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1588,7 +1592,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
}
|
||||
|
||||
if (i0 == i1) {
|
||||
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
LOG_ERR("%s : task %zu does not fit in the context window (requires %lu tokens)\n", __func__, i0, tasks[i0].required_tokens);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -692,6 +692,13 @@ struct completion_token_output {
|
||||
}
|
||||
};
|
||||
|
||||
struct swa_checkpoint {
|
||||
llama_pos pos_min;
|
||||
llama_pos pos_max;
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
};
|
||||
|
||||
struct server_task_result_cmpl_final : server_task_result {
|
||||
int index = 0;
|
||||
|
||||
@@ -1336,6 +1343,8 @@ struct server_slot {
|
||||
|
||||
std::vector<completion_token_output> generated_token_probs;
|
||||
|
||||
std::vector<swa_checkpoint> swa_checkpoints;
|
||||
|
||||
bool has_next_token = true;
|
||||
bool has_new_line = false;
|
||||
bool truncated = false;
|
||||
@@ -3293,6 +3302,8 @@ struct server_context {
|
||||
slot.n_past = 0;
|
||||
}
|
||||
|
||||
const auto n_swa = llama_model_n_swa(model);
|
||||
|
||||
if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
|
||||
if (pos_min == -1) {
|
||||
@@ -3300,12 +3311,58 @@ struct server_context {
|
||||
GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
|
||||
}
|
||||
|
||||
const auto n_swa = llama_model_n_swa(model);
|
||||
if (pos_min > std::max(0, slot.n_past - n_swa)) {
|
||||
const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
|
||||
|
||||
if (pos_min > pos_min_thold) {
|
||||
SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
|
||||
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
|
||||
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
|
||||
slot.n_past = 0;
|
||||
|
||||
// search for a SWA checkpoint
|
||||
const auto it = std::find_if(
|
||||
slot.swa_checkpoints.rbegin(),
|
||||
slot.swa_checkpoints.rend(),
|
||||
[&](const auto & cur) {
|
||||
return cur.pos_min <= pos_min_thold;
|
||||
}
|
||||
);
|
||||
|
||||
bool do_reset = it == slot.swa_checkpoints.rend();
|
||||
|
||||
if (!do_reset) {
|
||||
// restore the checkpoint
|
||||
const size_t swa_size = it->data.size();
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), swa_size, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
|
||||
|
||||
if (n != swa_size) {
|
||||
SLT_ERR(slot, "failed to restore SWA checkpoint, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
|
||||
do_reset = true;
|
||||
} else {
|
||||
slot.n_past = std::min(slot.n_past, it->pos_max);
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint restore, pos_min = %d, pos_max = %d, size = %.3f MiB\n", it->pos_min, it->pos_max, (float) swa_size / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
|
||||
if (do_reset) {
|
||||
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
|
||||
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
|
||||
|
||||
slot.n_past = 0;
|
||||
slot.swa_checkpoints.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_swa > 0) {
|
||||
const auto pos_min_thold = std::max(0, slot.n_past - n_swa);
|
||||
|
||||
// erase any checkpoints with pos_min > pos_min_thold
|
||||
for (int i = (int) slot.swa_checkpoints.size() - 1; i >= 0; i--) {
|
||||
const auto & cur = slot.swa_checkpoints[i];
|
||||
if (cur.pos_min > pos_min_thold) {
|
||||
slot.swa_checkpoints.erase(slot.swa_checkpoints.begin() + i);
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %.3f MiB\n", cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3519,6 +3576,39 @@ struct server_context {
|
||||
|
||||
// prompt evaluated for next-token prediction
|
||||
slot.state = SLOT_STATE_GENERATING;
|
||||
|
||||
// make a checkpoint with the SWA memory
|
||||
// checkpoints are needed only if we are not using "--swa-full"
|
||||
if (llama_model_n_swa(model) > 0 && !params_base.swa_full && params_base.n_swa_checkpoints > 0) {
|
||||
if (slot.swa_checkpoints.size() >= (size_t) params_base.n_swa_checkpoints) {
|
||||
{
|
||||
const auto & cur = slot.swa_checkpoints.back();
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint erase, pos_min = %d, pos_max = %d, size = %.3f MiB\n",
|
||||
cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
|
||||
}
|
||||
|
||||
slot.swa_checkpoints.erase(slot.swa_checkpoints.begin());
|
||||
}
|
||||
|
||||
const size_t swa_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
|
||||
|
||||
auto & cur = slot.swa_checkpoints.emplace_back(swa_checkpoint{
|
||||
/*.pos_min = */ llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id),
|
||||
/*.pos_max = */ llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id),
|
||||
/*.data = */ std::vector<uint8_t>(swa_size),
|
||||
});
|
||||
|
||||
llama_state_seq_get_data_ext(ctx, cur.data.data(), swa_size, slot.id, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
|
||||
|
||||
float size_total = 0.0f;
|
||||
for (const auto & checkpoint : slot.swa_checkpoints) {
|
||||
size_total += (float) checkpoint.data.size() / 1024 / 1024;
|
||||
}
|
||||
|
||||
SLT_WRN(slot, "SWA checkpoint create, pos_min = %d, pos_max = %d, size = %.3f MiB, total = %d/%d (%.3f MiB)\n",
|
||||
cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024, (int) slot.swa_checkpoints.size(), params_base.n_swa_checkpoints, size_total);
|
||||
}
|
||||
} else if (slot.state != SLOT_STATE_GENERATING) {
|
||||
continue; // continue loop of slots
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user