Compare commits

..

21 Commits

Author SHA1 Message Date
Georgi Gerganov
afb3929279 Merge branch 'master' into llama-refactor 2023-10-31 20:35:31 +02:00
Georgi Gerganov
5baefef497 llama : add llm_build helper functions (#3848)
* llama : add llm_build_norm helper function

ggml-ci

* llama : add llm_build_ffn helper function (#3849)

ggml-ci

* llama : add llm_build_k_shift helper

ggml-ci

* llama : fix offloading after recent changes

* llama : add llm_build_kv_store helper

ggml-ci

* llama : remove obsolete offload names

* llama : fix llm_build_k_shift to use n_head_kv instead of n_head

* llama : simplify falcon Q, K, V computation

* llama : remove obsolete comments in build graphs

* llama : add llm_build_kqv helper

ggml-ci

* llama : minor

* llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading

* llama : fix input allocation logic

* llama : update offload functions for KQ tensors

* llama : normalize tensor names

ggml-ci

* llama : enable warning about not offloaded tensors

* llama : remove extra ; + deduplicate gate_b logic

* llama : add llm_build_inp_embd helper
2023-10-31 19:23:12 +02:00
Georgi Gerganov
210e6e5d02 llama : remove obsolete map for layer counting 2023-10-29 13:39:04 +02:00
Georgi Gerganov
79ad734417 llama : comment
ggml-ci
2023-10-29 13:27:53 +02:00
Georgi Gerganov
761087932b llama : add functional header 2023-10-29 13:26:32 +02:00
Georgi Gerganov
8925cf9ef8 llama : add layer index to all tensor names 2023-10-29 13:22:15 +02:00
Georgi Gerganov
1e9c5443c2 llama : refactor tensor offloading as callback 2023-10-29 13:05:10 +02:00
Georgi Gerganov
da936188d8 llama : move refact in correct place + optimize graph input 2023-10-29 11:48:58 +02:00
Georgi Gerganov
739b85c985 llama : try to fix build 2023-10-29 11:25:32 +02:00
Georgi Gerganov
25cfbf6776 llama : fix non-CUDA build 2023-10-29 11:12:03 +02:00
Georgi Gerganov
b4ad03b3a7 llama : try to optimize offloading code 2023-10-29 10:33:11 +02:00
Georgi Gerganov
79617902ea llama : fix res_norm offloading 2023-10-29 09:20:35 +02:00
Georgi Gerganov
e14aa46151 llama : do tensor offload only with CUDA 2023-10-29 08:03:46 +02:00
Georgi Gerganov
0dc05b8433 llama : factor graph input into a function 2023-10-29 07:52:43 +02:00
Georgi Gerganov
4e98897ede llama : support offloading result_norm + comments 2023-10-29 07:36:07 +02:00
Georgi Gerganov
51c4f9ee9f llama : comments 2023-10-28 22:50:08 +03:00
Georgi Gerganov
3af8771389 llama : update offload log messages to print node index 2023-10-28 22:36:44 +03:00
Georgi Gerganov
83d2c43791 llama : offload rest of the models
ggml-ci
2023-10-28 22:30:54 +03:00
Georgi Gerganov
38aca9e1ab llama : factor out tensor offloading outside the build call (wip)
ggml-ci
2023-10-28 21:22:31 +03:00
Georgi Gerganov
5946d98fc8 metal : disable kernel load log 2023-10-28 21:22:01 +03:00
Georgi Gerganov
8b2420d249 llama : factor out ggml-alloc from graph graph build functions
ggml-ci
2023-10-28 19:54:28 +03:00
20 changed files with 1359 additions and 1778 deletions

1
.gitignore vendored
View File

@@ -15,7 +15,6 @@
.DS_Store
.build/
.cache/
.ccls-cache/
.direnv/
.envrc
.swiftpm

View File

@@ -103,24 +103,9 @@ void process_escapes(std::string& input) {
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool result = true;
try {
if (!gpt_params_parse_ex(argc, argv, params)) {
gpt_print_usage(argc, argv, gpt_params());
exit(0);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
gpt_print_usage(argc, argv, gpt_params());
exit(1);
}
return result;
}
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
llama_sampling_params & sparams = params.sparams;
@@ -233,12 +218,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
sparams.top_p = std::stof(argv[i]);
} else if (arg == "--min-p") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.min_p = std::stof(argv[i]);
} else if (arg == "--temp") {
if (++i >= argc) {
invalid_param = true;
@@ -569,8 +548,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
} else if (arg == "-h" || arg == "--help") {
return false;
gpt_print_usage(argc, argv, default_params);
#ifndef LOG_DISABLE_LOGS
log_print_usage();
#endif // LOG_DISABLE_LOGS
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else if (arg == "--in-prefix-bos") {
@@ -629,17 +611,22 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS
} else {
throw std::invalid_argument("error: unknown argument: " + arg);
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.escape) {
@@ -658,7 +645,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sparams;
printf("\n");
printf("usage: %s [options]\n", argv[0]);
printf("\n");
printf("options:\n");
@@ -693,7 +679,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -770,9 +755,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf("\n");
#ifndef LOG_DISABLE_LOGS
log_print_usage();
#endif // LOG_DISABLE_LOGS
}
std::string get_system_info(const gpt_params & params) {
@@ -1293,7 +1275,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}

View File

@@ -110,8 +110,6 @@ struct gpt_params {
std::string image = ""; // path to an image file
};
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

View File

@@ -97,56 +97,38 @@
#define LOG_TEE_TARGET stderr
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// NOTE: currently disabled as it produces too many log files
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
//inline std::string log_get_pid()
//{
// static std::string pid;
// if (pid.empty())
// {
// // std::this_thread::get_id() is the most portable way of obtaining a "process id"
// // it's not the same as "pid" but is unique enough to solve multiple instances
// // trying to write to the same log.
// std::stringstream ss;
// ss << std::this_thread::get_id();
// pid = ss.str();
// }
//
// return pid;
//}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
{
static bool _multilog = false;
if (multilog != LogTriStateSame)
{
_multilog = multilog == LogTriStateTrue;
}
std::stringstream buf;
buf << log_file_basename;
if (_multilog)
{
buf << ".";
buf << log_get_pid();
}
//buf << ".";
//buf << log_get_pid();
buf << ".";
buf << log_file_extension;
@@ -231,6 +213,15 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL ,""
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
@@ -324,23 +315,16 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized = false;
static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static bool _initialized{false};
static bool _disabled{(filename.empty() && target == nullptr)};
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue)
{
// Disable primary target
@@ -393,7 +377,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
}
}
logfile = fopen(filename.c_str(), _append ? "a" : "w");
logfile = fopen(filename.c_str(), "w");
}
if (!logfile)
@@ -414,9 +398,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, append, disable, filename, target);
return log_handler1_impl(change, disable, filename, target);
}
// Disables logs entirely at runtime.
@@ -427,7 +411,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
return log_handler1_impl(true, LogTriStateTrue);
}
// Enables logs at runtime.
@@ -436,31 +420,19 @@ inline FILE *log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
return log_handler1_impl(true, LogTriStateFalse);
}
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
inline void log_test()
{
log_disable();
@@ -522,18 +494,6 @@ inline bool log_param_single_parse(const std::string & param)
return true;
}
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false;
}
@@ -563,9 +523,7 @@ inline void log_print_usage()
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)

View File

@@ -39,7 +39,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
void llama_sampling_reset(llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
}
if (!ctx->parsed_grammar.rules.empty()) {
@@ -90,10 +89,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
return std::string(result);
@@ -111,7 +110,6 @@ llama_token llama_sampling_sample(
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -192,7 +190,6 @@ llama_token llama_sampling_sample(
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
llama_sample_temp (ctx_main, &cur_p, temp);
id = llama_sample_token(ctx_main, &cur_p);

View File

@@ -14,7 +14,6 @@ typedef struct llama_sampling_params {
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled

View File

@@ -1045,7 +1045,6 @@ struct train_params_common get_default_train_params_common() {
params.n_batch = 8;
params.n_gradient_accumulation = 1;
params.n_epochs = -1;
params.n_gpu_layers = 0;
params.custom_n_ctx = false;
@@ -1081,7 +1080,6 @@ struct train_params_common get_default_train_params_common() {
params.adam_beta2 = 0.999f;
params.adam_gclip = 1.0f;
params.adam_eps_f = 0.0f;
return params;
}

View File

@@ -44,7 +44,6 @@ struct train_params_common {
int n_batch;
int n_gradient_accumulation;
int n_epochs;
int n_gpu_layers;
bool custom_n_ctx;

View File

@@ -652,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
if (ggml_is_quantized(a->type)) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b);
@@ -1459,17 +1459,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
}
params->n_rank_w3 = std::stoi(argv[i]);
params->custom_n_rank_w3 = true;
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params->common.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params);
@@ -1556,7 +1545,6 @@ int main(int argc, char ** argv) {
srand(params.common.seed);
struct llama_model_params llama_mparams = llama_model_default_params();
llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
llama_mparams.vocab_only = false;
printf("%s: model base = '%s'\n", __func__, params.fn_model_base);

View File

@@ -1,34 +0,0 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

View File

@@ -208,14 +208,6 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
Example usage: `--top-p 0.95`
### Min P Sampling
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
Example usage: `--min-p 0.05`
### Tail Free Sampling (TFS)
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

View File

@@ -149,7 +149,6 @@ struct task_server {
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
};
struct task_result {
@@ -372,7 +371,6 @@ struct llama_client_slot
std::vector<completion_token_output> generated_token_probs;
bool infill = false;
bool embedding = false;
bool has_next_token = true;
bool truncated = false;
bool stopped_eos = false;
@@ -1246,14 +1244,13 @@ struct llama_server_context
queue_results.push_back(res);
}
int request_completion(json data, bool infill, bool embedding)
int request_completion(json data, bool infill)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.data = data;
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
queue_tasks.push_back(task);
return task.id;
@@ -1379,7 +1376,7 @@ struct llama_server_context
{
LOG_TEE("slot unavailable\n");
// send error result
send_error(task.id, "slot unavailable");
send_error(task.id, "slot unavaliable");
return;
}
@@ -1391,7 +1388,6 @@ struct llama_server_context
slot->reset();
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;
if (!launch_slot_with_data(slot, task.data))
@@ -1699,7 +1695,7 @@ struct llama_server_context
}
// prompt evaluated for embedding
if (slot.embedding)
if (params.embedding)
{
send_embedding(slot);
slot.release();
@@ -2278,7 +2274,7 @@ int main(int argc, char **argv)
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false, false);
const int task_id = llama.request_completion(data, false);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);
@@ -2333,7 +2329,7 @@ int main(int argc, char **argv)
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true, false);
const int task_id = llama.request_completion(data, true);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);
@@ -2437,7 +2433,7 @@ int main(int argc, char **argv)
{
prompt = "";
}
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json");
});

View File

@@ -513,15 +513,6 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
dst[i] = __hadd(x[i], __float2half(y[i]));
}
static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = __half2float(x[i]) + y[i];
}
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -4702,11 +4693,6 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
}
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -6010,10 +5996,7 @@ inline void ggml_cuda_op_add(
add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
} else {
fprintf(stderr, "src0->type: %d dst->type: %d\n", src0->type, dst->type);
GGML_ASSERT(false);
}

View File

@@ -1001,15 +1001,11 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_SOFT_MAX:
{
int nth = 32; // SIMD width
const int nth = MIN(32, ne00);
if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
} else {
do {
nth *= 2;
} while (nth <= ne00 && nth <= 1024);
nth /= 2;
[encoder setComputePipelineState:ctx->pipeline_soft_max];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1017,9 +1013,8 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_DIAG_MASK_INF:
{

View File

@@ -184,73 +184,36 @@ kernel void kernel_soft_max(
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
threadgroup float * buf [[threadgroup(0)]],
uint tgpig[[threadgroup_position_in_grid]],
uint tpitg[[thread_position_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint ntg[[threads_per_threadgroup]]) {
const int64_t i03 = (tgpig) / (ne02*ne01);
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
// parallel max
float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
lmax = MAX(lmax, psrc0[i00]);
}
float max = simd_max(lmax);
if (tiisg == 0) {
buf[sgitg] = max;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
max = buf[0];
const float max = simd_max(lmax);
// parallel sum
float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
const float exp_psrc0 = exp(psrc0[i00] - max);
lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
// whish to compute it twice.
pdst[i00] = exp_psrc0;
}
float sum = simd_sum(lsum);
if (tiisg == 0) {
buf[sgitg] = sum;
}
const float sum = simd_sum(lsum);
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] += buf[tpitg + i];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sum = buf[0];
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
pdst[i00] /= sum;
}
}
@@ -261,73 +224,37 @@ kernel void kernel_soft_max_4(
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
threadgroup float * buf [[threadgroup(0)]],
uint tgpig[[threadgroup_position_in_grid]],
uint tpitg[[thread_position_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]],
uint tiisg[[thread_index_in_simdgroup]],
uint ntg[[threads_per_threadgroup]]) {
const int64_t i03 = (tgpig) / (ne02*ne01);
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
// parallel max
float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
lmax4 = fmax(lmax4, psrc4[i00]);
}
float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
float max = simd_max(lmax);
if (tiisg == 0) {
buf[sgitg] = max;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
max = buf[0];
const float max = simd_max(lmax);
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
const float4 exp_psrc4 = exp(psrc4[i00] - max);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
float sum = simd_sum(lsum);
if (tiisg == 0) {
buf[sgitg] = sum;
}
const float sum = simd_sum(lsum);
threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) {
buf[tpitg] += buf[tpitg + i];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sum = buf[0];
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
pdst4[i00] /= sum;
}
}
@@ -347,7 +274,7 @@ kernel void kernel_diag_mask_inf(
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
} else {
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
}
}
}
kernel void kernel_diag_mask_inf_8(

View File

@@ -716,7 +716,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
__riscv_vse8_v_i8m1(y[i].qs , vs, vl);
}
#else
GGML_UNUSED(nb);
// scalar
quantize_row_q8_0_reference(x, y, k);
#endif
@@ -970,7 +969,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
y[i].s = sum*d;
}
#else
GGML_UNUSED(nb);
// scalar
quantize_row_q8_1_reference(x, y, k);
#endif

49
ggml.c
View File

@@ -3153,7 +3153,7 @@ static struct ggml_tensor * ggml_add_cast_impl(
// TODO: support less-strict constraint
// GGML_ASSERT(ggml_can_repeat(b, a));
GGML_ASSERT(ggml_can_repeat_rows(b, a));
GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input
bool is_node = false;
@@ -6927,15 +6927,9 @@ static void ggml_compute_forward_add_f16_f32(
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16);
if (dst->type == GGML_TYPE_F32) {
GGML_ASSERT( nb0 == sizeof(float));
}
else {
GGML_ASSERT(dst->type == GGML_TYPE_F16);
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
}
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
// rows per thread
@@ -6946,35 +6940,18 @@ static void ggml_compute_forward_add_f16_f32(
const int ir1 = MIN(ir0 + dr, nr);
if (nb10 == sizeof(float)) {
if (dst->type == GGML_TYPE_F16) {
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
}
}
} else {
for (int ir = ir0; ir < ir1; ++ir) {
// src0, src1 and dst are same shape => same indices
const int i3 = ir/(ne2*ne1);
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
}
for (int i = 0; i < ne0; i++) {
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
}
}
}

2276
llama.cpp

File diff suppressed because it is too large Load Diff

View File

@@ -598,13 +598,6 @@ extern "C" {
float p,
size_t min_keep);
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
LLAMA_API void llama_sample_min_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(
struct llama_context * ctx,

View File

@@ -1,391 +0,0 @@
#!/bin/bash
#
# Helper script for deploying llama.cpp server with a single Bash command
#
# - Works on Linux and macOS
# - Supports: CPU, CUDA, Metal, OpenCL
# - Can run all GGUF models from HuggingFace
# - Can serve requests in parallel
# - Always builds latest llama.cpp from GitHub
#
# Limitations
#
# - Chat templates are poorly supported (base models recommended)
# - Might be unstable!
#
# Usage:
# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
#
# --port: port number, default is 8888
# --repo: path to a repo containing GGUF model files
# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input
# --backend: cpu, cuda, metal, opencl, depends on the OS
# --gpu-id: gpu id, default is 0
# --n-parallel: number of parallel requests, default is 8
# --n-kv: KV cache size, default is 4096
# --verbose: verbose output
#
# Example:
#
# bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
#
set -e
# required utils: curl, git, make
if ! command -v curl &> /dev/null; then
printf "[-] curl not found\n"
exit 1
fi
if ! command -v git &> /dev/null; then
printf "[-] git not found\n"
exit 1
fi
if ! command -v make &> /dev/null; then
printf "[-] make not found\n"
exit 1
fi
# parse arguments
port=8888
repo=""
wtype=""
backend="cpu"
# if macOS, use metal backend by default
if [[ "$OSTYPE" == "darwin"* ]]; then
backend="metal"
elif command -v nvcc &> /dev/null; then
backend="cuda"
fi
gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
function print_usage {
printf "Usage:\n"
printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
printf " --port: port number, default is 8888\n"
printf " --repo: path to a repo containing GGUF model files\n"
printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n"
printf " --gpu-id: gpu id, default is 0\n"
printf " --n-parallel: number of parallel requests, default is 8\n"
printf " --n-kv: KV cache size, default is 4096\n"
printf " --verbose: verbose output\n\n"
printf "Example:\n\n"
printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
}
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--port)
port="$2"
shift
shift
;;
--repo)
repo="$2"
shift
shift
;;
--wtype)
wtype="$2"
shift
shift
;;
--backend)
backend="$2"
shift
shift
;;
--gpu-id)
gpu_id="$2"
shift
shift
;;
--n-parallel)
n_parallel="$2"
shift
shift
;;
--n-kv)
n_kv="$2"
shift
shift
;;
--verbose)
verbose=1
shift
;;
--help)
print_usage
exit 0
;;
*)
echo "Unknown argument: $key"
print_usage
exit 1
;;
esac
done
# available weights types
wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
wfiles=()
for wt in "${wtypes[@]}"; do
wfiles+=("")
done
# sample repos
repos=(
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
"https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
"https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
"https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
"https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
"https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
"https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
)
printf "\n"
printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
printf " Based on the options that follow, the script might download a model file\n"
printf " from the internet, which can be a few GBs in size. The script will also\n"
printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n"
printf "\n"
printf " Upon success, an HTTP server will be started and it will serve the selected\n"
printf " model using llama.cpp for demonstration purposes.\n"
printf "\n"
printf " Please note:\n"
printf "\n"
printf " - All new data will be stored in the current folder\n"
printf " - The server will be listening on all network interfaces\n"
printf " - The server will run with default settings which are not always optimal\n"
printf " - Do not judge the quality of a model based on the results from this script\n"
printf " - Do not use this script to benchmark llama.cpp\n"
printf " - Do not use this script in production\n"
printf " - This script is only for demonstration purposes\n"
printf "\n"
printf " If you don't know what you are doing, please press Ctrl-C to abort now\n"
printf "\n"
printf " Press Enter to continue ...\n\n"
read
if [[ -z "$repo" ]]; then
printf "[+] No repo provided from the command line\n"
printf " Please select a number from the list below or enter an URL:\n\n"
is=0
for r in "${repos[@]}"; do
printf " %2d) %s\n" $is "$r"
is=$((is+1))
done
# ask for repo until index of sample repo is provided or an URL
while [[ -z "$repo" ]]; do
printf "\n Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
read -p "[+] Select repo: " repo
# check if the input is a number
if [[ "$repo" =~ ^[0-9]+$ ]]; then
if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
repo="${repos[$repo]}"
else
printf "[-] Invalid repo index: %s\n" "$repo"
repo=""
fi
elif [[ "$repo" =~ ^https?:// ]]; then
repo="$repo"
else
printf "[-] Invalid repo URL: %s\n" "$repo"
repo=""
fi
done
fi
# remove suffix
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
printf "[+] Checking for GGUF model files in %s\n" "$repo"
# find GGUF files in the source
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
# list all files in the provided git repo
printf "[+] Model files:\n\n"
for file in $model_files; do
# determine iw by grepping the filename with wtypes
iw=-1
is=0
for wt in "${wtypes[@]}"; do
# uppercase
ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
if [[ "$ufile" =~ "$wt" ]]; then
iw=$is
break
fi
is=$((is+1))
done
if [[ $iw -eq -1 ]]; then
continue
fi
wfiles[$iw]="$file"
have=" "
if [[ -f "$file" ]]; then
have="*"
fi
printf " %2d) %s %s\n" $iw "$have" "$file"
done
# ask for weights type until provided and available
while [[ -z "$wtype" ]]; do
printf "\n"
read -p "[+] Select weight type: " wtype
wfile="${wfiles[$wtype]}"
if [[ -z "$wfile" ]]; then
printf "[-] Invalid weight type: %s\n" "$wtype"
wtype=""
fi
done
printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
url="${repo%/}/resolve/main/$wfile"
# check file if the model has been downloaded before
chk="$wfile.chk"
# check if we should download the file
# - if $wfile does not exist
# - if $wfile exists but $chk does not exist
# - if $wfile exists and $chk exists but $wfile is newer than $chk
# TODO: better logic using git lfs info
do_download=0
if [[ ! -f "$wfile" ]]; then
do_download=1
elif [[ ! -f "$chk" ]]; then
do_download=1
elif [[ "$wfile" -nt "$chk" ]]; then
do_download=1
fi
if [[ $do_download -eq 1 ]]; then
printf "[+] Downloading weights from %s\n" "$url"
# download the weights file
curl -o "$wfile" -# -L "$url"
# create a check file if successful
if [[ $? -eq 0 ]]; then
printf "[+] Creating check file %s\n" "$chk"
touch "$chk"
fi
else
printf "[+] Using cached weights %s\n" "$wfile"
fi
# get latest llama.cpp and build
printf "[+] Downloading latest llama.cpp\n"
llama_cpp_dir="__llama_cpp_port_${port}__"
if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
# if the dir exists and there isn't a file "__ggml_script__" in it, abort
printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
printf "[-] Please remove it and try again\n"
exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
printf "[+] Using cached llama.cpp\n"
cd "$llama_cpp_dir"
git reset --hard
git fetch
git checkout origin/master
cd ..
else
printf "[+] Cloning llama.cpp\n"
git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi
# mark that that the directory is made by this script
touch "$llama_cpp_dir/__ggml_script__"
if [[ $verbose -eq 1 ]]; then
set -x
fi
# build
cd "$llama_cpp_dir"
make clean
log="--silent"
if [[ $verbose -eq 1 ]]; then
log=""
fi
if [[ "$backend" == "cuda" ]]; then
printf "[+] Building with CUDA backend\n"
LLAMA_CUBLAS=1 make -j server $log
elif [[ "$backend" == "cpu" ]]; then
printf "[+] Building with CPU backend\n"
make -j server $log
elif [[ "$backend" == "metal" ]]; then
printf "[+] Building with Metal backend\n"
make -j server $log
elif [[ "$backend" == "opencl" ]]; then
printf "[+] Building with OpenCL backend\n"
LLAMA_CLBLAST=1 make -j server $log
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
# run the server
printf "[+] Running server\n"
args=""
if [[ "$backend" == "cuda" ]]; then
export CUDA_VISIBLE_DEVICES=$gpu_id
args="-ngl 999"
elif [[ "$backend" == "cpu" ]]; then
args="-ngl 0"
elif [[ "$backend" == "metal" ]]; then
args="-ngl 999"
elif [[ "$backend" == "opencl" ]]; then
args="-ngl 999"
else
printf "[-] Unknown backend: %s\n" "$backend"
exit 1
fi
if [[ $verbose -eq 1 ]]; then
args="$args --verbose"
fi
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
exit 0