Compare commits

..

5 Commits

Author SHA1 Message Date
Kylin
1e3bc523d8 ggml : support CUDA's half type for aarch64(#1455) (#2670)
* ggml: support CUDA's half type for aarch64(#1455)
support CUDA's half type for aarch64 in ggml_fp16_t definition

* ggml: use __CUDACC__ to recognise nvcc compiler
2023-08-22 10:14:23 +03:00
Shouzheng Liu
14b1d7e6f7 metal : add missing barriers for mul-mat (#2699) 2023-08-22 09:18:40 +03:00
Jhen-Jie Hong
226255b44e server : fallback to default if client param is null (#2688)
* server : fallback to default if client param is null

* server : do not overwrite 404 if status is 500 from exception_handler
2023-08-22 08:32:00 +08:00
Kerfuffle
930523c8e1 Fix convert-llama-ggmlv3-to-gguf.py vocab conversion (#2698)
When converting without metadata, the hex value for bytes entries weren't 0 padded to 2 digits.
2023-08-21 18:01:34 -06:00
Georgi Gerganov
c8dba409e6 py : remove obsolete script 2023-08-21 23:40:22 +03:00
5 changed files with 40 additions and 43 deletions

View File

@@ -236,8 +236,7 @@ class GGMLToGGUF:
if len(vbytes) == 0:
tt = 3 # Control
elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
hv = hex(vbytes[0])[2:].upper()
vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
tt = 6 # Byte
else:
vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')

View File

@@ -1,13 +0,0 @@
# Compatibility stub
import argparse
import convert
parser = argparse.ArgumentParser(
description="""[DEPRECATED - use `convert.py` instead]
Convert a LLaMA model checkpoint to a ggml compatible file""")
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])

View File

@@ -1056,33 +1056,42 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{"tokens", tokens}};
}
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
static void parse_options_completion(const json &body, llama_server_context &llama)
{
gpt_params default_params;
llama.stream = body.value("stream", false);
llama.params.n_predict = body.value("n_predict", default_params.n_predict);
llama.params.top_k = body.value("top_k", default_params.top_k);
llama.params.top_p = body.value("top_p", default_params.top_p);
llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
llama.params.typical_p = body.value("typical_p", default_params.typical_p);
llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
llama.params.temp = body.value("temperature", default_params.temp);
llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
llama.params.mirostat = body.value("mirostat", default_params.mirostat);
llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
llama.params.n_keep = body.value("n_keep", default_params.n_keep);
llama.params.seed = body.value("seed", default_params.seed);
llama.params.prompt = body.value("prompt", default_params.prompt);
llama.params.grammar = body.value("grammar", default_params.grammar);
llama.params.n_probs = body.value("n_probs", default_params.n_probs);
llama.stream = json_value(body, "stream", false);
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
llama.params.top_k = json_value(body, "top_k", default_params.top_k);
llama.params.top_p = json_value(body, "top_p", default_params.top_p);
llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
llama.params.temp = json_value(body, "temperature", default_params.temp);
llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
llama.params.seed = json_value(body, "seed", default_params.seed);
llama.params.prompt = json_value(body, "prompt", default_params.prompt);
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
llama.params.logit_bias.clear();
if (body.value("ignore_eos", false))
if (json_value(body, "ignore_eos", false))
{
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
}
@@ -1337,7 +1346,7 @@ int main(int argc, char **argv)
auto lock = llama.lock();
const json body = json::parse(req.body);
const std::string content = body.value("content", "");
const std::string content = json_value<std::string>(body, "content", "");
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); });
@@ -1350,7 +1359,7 @@ int main(int argc, char **argv)
llama.rewind();
llama_reset_timings(llama.ctx);
llama.params.prompt = body.value("content", "");
llama.params.prompt = json_value<std::string>(body, "content", "");
llama.params.n_predict = 0;
llama.loadPrompt();
llama.beginCompletion();
@@ -1379,7 +1388,7 @@ int main(int argc, char **argv)
{
if (res.status == 400) {
res.set_content("Invalid request", "text/plain");
} else {
} else if (res.status != 500) {
res.set_content("File Not Found", "text/plain");
res.status = 404;
} });

View File

@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
threadgroup_barrier(mem_flags::mem_device);
threadgroup_barrier(mem_flags::mem_threadgroup);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {

5
ggml.h
View File

@@ -259,8 +259,9 @@
extern "C" {
#endif
#ifdef __ARM_NEON
// we use the built-in 16-bit float type
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;