Compare commits

...

12 Commits

Author SHA1 Message Date
Borislav Stanimirov
ff966e7ca6 build : fix several cast and printf warnings (#2499) 2023-08-04 13:07:21 +03:00
Evan Jones
8183159cf3 examples : generate JSON according to schema (#1887)
* examples : add JSON schema grammars

* complete JSON grammar

* ensure primitive types can be used as root of schema

* support integer type and adjust usage text
2023-08-02 22:05:44 -04:00
Johannes Gäßler
468ea24fb4 CUDA: faster non k-quant mul_mat_q kernels (#2483) 2023-08-02 18:04:04 +02:00
Johannes Gäßler
4f6b60c776 CUDA: Fix models with output size != 32000 (#2480) 2023-08-02 16:48:10 +02:00
ldwang
220d931864 readme : add Aquila-7B model series to supported models (#2487)
* support bpe tokenizer in convert

Signed-off-by: ldwang <ftgreat@gmail.com>

* support bpe tokenizer in convert

Signed-off-by: ldwang <ftgreat@gmail.com>

* support bpe tokenizer in convert, fix

Signed-off-by: ldwang <ftgreat@gmail.com>

* Add Aquila-7B models in README.md

Signed-off-by: ldwang <ftgreat@gmail.com>

* Up Aquila-7B models in README.md

Signed-off-by: ldwang <ftgreat@gmail.com>

---------

Signed-off-by: ldwang <ftgreat@gmail.com>
Co-authored-by: ldwang <ftgreat@gmail.com>
2023-08-02 11:21:11 +03:00
Eve
81844fbcfd tests : Fix compilation warnings (Linux/GCC) (#2451)
* fix hellaswag print format, cast away warning in test-double-float

* c++11 cannot use designated initializers

* add static to test-grad0.c internal functions

* use memcpy in test-double-float.c

* port c tests to c++

* use initializer list for ggml_init_params
2023-08-02 11:06:19 +03:00
Yiming Cui
a312193e18 readme : Add Chinese LLaMA-2 / Alpaca-2 to supported models (#2475)
* add support for chinese llama-2 / alpaca-2

* remove white spaces
2023-08-02 09:18:31 +03:00
Bono Lv
c574bddb36 fix a typo in examples/server/README.md (#2478) 2023-08-01 14:54:28 +02:00
ebraminio
86aeb27734 server : Support dark mode (#2414)
* server : Support dark mode

So it respects user system light / dark settings.

* Update index.html.hpp by running ./deps.sh
2023-08-01 10:56:23 +02:00
Matteo Boschini
1873ff586b metal : add gqa8 kernel to allow llama-2-70B on metal (#2459)
* Added gqa8 kernel to allow llama-2-70B on metal

* Update ggml-metal.m

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>

* Extend kernel_mul_mat_f16_f32 to handle gqa broadcast

* Added ne03==ne13 assertion

---------

Co-authored-by: Cebtenzzre <cebtenzzre@gmail.com>
2023-08-01 10:43:12 +03:00
Johannes Gäßler
49e7cb5bb1 CUDA: fixed LLAMA_FAST compilation option (#2473) 2023-07-31 21:02:19 +02:00
Johannes Gäßler
b772bba42e CUDA: fixed cmake F16 option (#2471) 2023-07-31 19:52:22 +02:00
21 changed files with 1897 additions and 1426 deletions

View File

@@ -73,7 +73,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
@@ -265,8 +265,8 @@ if (LLAMA_CUBLAS)
if (DEFINED LLAMA_CUDA_DMMV_Y)
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
endif()
if (LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_DMMV_F16)
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
add_compile_definitions(GGML_CUDA_F16)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -280,8 +280,8 @@ if (LLAMA_CUBLAS)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_DMMV_F16)
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics

View File

@@ -243,7 +243,7 @@ ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST
@@ -411,13 +411,13 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
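The NVCC rule above swaps -Ofast for -O3 only in that one invocation, using GNU make's $(subst) text function. A minimal illustration of the mechanism (variable values hypothetical, unrelated to the real build flags):

CXXFLAGS := -Ofast -std=c++11
NVCC_CXXFLAGS := $(subst -Ofast,-O3,$(CXXFLAGS))  # expands to "-O3 -std=c++11"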

View File

@@ -80,7 +80,7 @@ as the main playground for developing new features for the [ggml](https://github
- [x] LLaMA 2 🦙🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -88,6 +88,7 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
**Bindings:**
@@ -492,6 +493,9 @@ Building the program with BLAS support may lead to some performance improvements
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
# [Optional] for models using BPE tokenizers
ls ./models
65B 30B 13B 7B vocab.json
# install Python dependencies
python3 -m pip install -r requirements.txt
@@ -499,6 +503,9 @@ python3 -m pip install -r requirements.txt
# convert the 7B model to ggml FP16 format
python3 convert.py models/7B/
# [Optional] for models using BPE tokenizers
python convert.py models/7B/ --vocabtype bpe
# quantize the model to 4-bits (using q4_0 method)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0

View File

@@ -572,7 +572,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %d)\n", params.hellaswag_tasks);
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) {

View File

@@ -30,7 +30,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
params.seed = uint32_t(time(NULL));
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

View File

@@ -405,7 +405,7 @@ namespace grammar_parser {
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, i, state.rules[i], symbol_id_names);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {

View File

@@ -0,0 +1,132 @@
import argparse
import json
import re
import sys
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
PRIMITIVE_RULES = {
'boolean': '("true" | "false") space',
'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
'string': r''' "\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
)* "\"" space ''',
'null': '"null" space',
}
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
class SchemaConverter:
def __init__(self, prop_order):
self._prop_order = prop_order
self._rules = {'space': SPACE_RULE}
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
)
return f'"{escaped}"'
def _add_rule(self, name, rule):
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
if esc_name not in self._rules or self._rules[esc_name] == rule:
key = esc_name
else:
i = 0
while f'{esc_name}{i}' in self._rules:
i += 1
key = f'{esc_name}{i}'
self._rules[key] = rule
return key
def visit(self, schema, name):
schema_type = schema.get('type')
rule_name = name or 'root'
if 'oneOf' in schema or 'anyOf' in schema:
rule = ' | '.join((
self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
))
return self._add_rule(rule_name, rule)
elif 'const' in schema:
return self._add_rule(rule_name, self._format_literal(schema['const']))
elif 'enum' in schema:
rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
return self._add_rule(rule_name, rule)
elif schema_type == 'object' and 'properties' in schema:
# TODO: `required` keyword
prop_order = self._prop_order
prop_pairs = sorted(
schema['properties'].items(),
# sort by position in prop_order (if specified) then by key
key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
)
rule = '"{" space'
for i, (prop_name, prop_schema) in enumerate(prop_pairs):
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
if i > 0:
rule += ' "," space'
rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
rule += ' "}" space'
return self._add_rule(rule_name, rule)
elif schema_type == 'array' and 'items' in schema:
# TODO `prefixItems` keyword
item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
return self._add_rule(rule_name, rule)
else:
assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
return self._add_rule(
'root' if rule_name == 'root' else schema_type,
PRIMITIVE_RULES[schema_type]
)
def format_grammar(self):
return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
def main(args_in = None):
parser = argparse.ArgumentParser(
description='''
Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
given JSON schema. Only a subset of JSON schema features are supported; more may be
added in the future.
''',
)
parser.add_argument(
'--prop-order',
default=[],
type=lambda s: s.split(','),
help='''
comma-separated property names defining the order of precedence for object properties;
properties not specified here are given lower precedence than those that are, and are
sorted alphabetically
'''
)
parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
args = parser.parse_args(args_in)
schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
converter = SchemaConverter(prop_order)
converter.visit(schema, '')
print(converter.format_grammar())
if __name__ == '__main__':
main()
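A possible invocation of the script above, assuming it is saved as json-schema-to-grammar.py (the file path is not shown in this view) and given a small illustrative schema:

echo '{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}' > person.json
python3 json-schema-to-grammar.py --prop-order name,age person.json

This prints rules of the form name ::= rule, one per line, which ./main can consume as a grammar.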

View File

@@ -153,7 +153,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %lu tasks from prompt.\n", __func__, hs_task_count);
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
// This is needed as usual for LLaMA models
bool prepend_bos = true;
@@ -178,7 +178,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double ending_logprob[4];
};
fprintf(stderr, "%s : selecting %lu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines
hs_data_t *hs_data = new hs_data_t[hs_task_count];
@@ -223,7 +223,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) {
fprintf(stderr, "%s : number of tokens in query %lu > n_ctxl\n", __func__, query_size);
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
return;
}
@@ -284,7 +284,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
// Print the accumulated accuracy mean x 100
printf("%li\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
fflush(stdout);
}
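The substance of these fixes: %lu matches size_t only where size_t is unsigned long, which fails on LLP64 Windows; %zu is the conversion specifier the C standard defines for size_t. A minimal sketch of both idioms used in this compare:

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> v(42);
    const size_t n = v.size();
    printf("%zu elements\n", n);      // %zu is correct for size_t on every platform
    printf("%d elements\n", (int) n); // alternative used elsewhere: cast to the printed type
    return 0;
}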

View File

@@ -163,7 +163,7 @@ node .
`content`: Set the text to tokenize.
Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.

File diff suppressed because it is too large.


View File

@@ -3,12 +3,11 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<meta name="color-scheme" content="light dark">
<title>llama.cpp - chat</title>
<style>
body {
background-color: #fff;
color: #000;
font-family: system-ui;
font-size: 90%;
}
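Together with the color-scheme meta tag added above, dropping the hard-coded background and text colors lets the browser's default stylesheet follow the user's light/dark preference. The CSS equivalent of that meta tag would be (a sketch, not part of the diff):

:root {
    color-scheme: light dark; /* opt the page into user-agent light and dark default colors */
}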

View File

@@ -123,7 +123,7 @@ int main(int argc, char ** argv)
// Evaluate the tokens :
//---------------------------------
if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
{
fprintf( stderr, "%s : failed to eval\n" , __func__ );
return 1;

File diff suppressed because it is too large.

View File

@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
GGML_ASSERT(ne00 == ne10);
GGML_ASSERT(ne02 == ne12);
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
GGML_ASSERT(ne03 == ne13);
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
initWithDevice:ctx->device transposeLeft:false transposeRight:true
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
// we need to do ne02 multiplications
// we need to do ne12 multiplications
// TODO: is there a way to do this in parallel - currently very slow ..
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
for (int64_t i02 = 0; i02 < ne02; ++i02) {
size_t offs_src0_cur = offs_src0 + i02*nb02;
for (int64_t i02 = 0; i02 < ne12; ++i02) {
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
size_t offs_src1_cur = offs_src1 + i02*nb12;
size_t offs_dst_cur = offs_dst + i02*nb2;
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
switch (src0t) {
case GGML_TYPE_F16:
{
GGML_ASSERT(ne02 == ne12);
nth0 = 64;
nth1 = 1;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
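The reworked loop implements broadcast over the batch dimension for grouped-query attention: several src1 slices map onto one src0 slice. The indexing in isolation (a sketch; assumes ne12 is a multiple of ne02, as the gqa8 case requires):

// ne12/ne02 query heads share each KV head
for (int64_t i12 = 0; i12 < ne12; ++i12) {
    const int64_t i02 = i12 / (ne12 / ne02); // src0 slice reused by this src1 slice
    // read src0 at offs_src0 + i02*nb02, src1 at offs_src1 + i12*nb12
}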

View File

@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
const int64_t r1 = tgpig.y;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
}
}
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,

View File

@@ -1,29 +1,25 @@
# Grammar for subset of JSON - doesn't support full string or number syntax
root ::= object
value ::= object | array | string | number | boolean | "null"
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}"
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]"
)? "]" ws
string ::=
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
# Only plain integers currently
number ::= "-"? [0-9]+ ws
boolean ::= ("true" | "false") ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
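The updated grammar can be fed to the main example; a possible invocation, reusing the model path from the README excerpt above and assuming the file is saved as grammars/json.gbnf and that this build exposes the --grammar-file option (otherwise --grammar takes the text inline):

./main -m ./models/7B/ggml-model-q4_0.bin --grammar-file grammars/json.gbnf -p 'A person described as JSON:'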

View File

@@ -10,5 +10,5 @@ cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
cp -rpv ../ggml/tests/test-opt.c ./tests/test-opt.c
cp -rpv ../ggml/tests/test-grad0.c ./tests/test-grad0.c
cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp

View File

@@ -6,10 +6,10 @@ function(llama_add_test source)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
# llama_add_test(test-double-float.c) # SLOW
# llama_add_test(test-double-float.cpp) # SLOW
llama_add_test(test-quantize-fns.cpp)
llama_add_test(test-quantize-perf.cpp)
llama_add_test(test-sampling.cpp)
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
llama_add_test(test-grad0.c) # SLOW
# llama_add_test(test-opt.c) # SLOW
llama_add_test(test-grad0.cpp) # SLOW
# llama_add_test(test-opt.cpp) # SLOW

View File

@@ -3,10 +3,11 @@
// This is done by checking all finite (non-NaN, non-infinite) floats.
#undef NDEBUG
#include <assert.h>
#include <cassert>
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <cmath>
#include <cstdint>
#include <cstring>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdouble-promotion"
@@ -32,8 +33,9 @@ inline static float silu_float(float x) {
int main(void) {
uint32_t x = UINT32_MAX;
do {
float f = *(float *)&x;
assert(!isfinite(f) || (round_orig(f) == round_float(f)));
float f;
memcpy(&f, &x, sizeof(x));
assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
} while (x--);
#ifdef __F16C__
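The pointer cast *(float *)&x that this replaces is undefined behavior in C++ (strict aliasing); memcpy is the well-defined way to reinterpret the bits, and compilers lower it to a single move. The pattern in isolation (a sketch):

#include <cstdint>
#include <cstring>

static float bits_to_float(uint32_t x) {
    float f;
    memcpy(&f, &x, sizeof(f)); // well-defined bit reinterpretation; optimized to a register move
    return f;
}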

View File

@@ -1,10 +1,10 @@
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
#include "ggml.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -47,16 +47,16 @@
#define GGML_PRINT(...) printf(__VA_ARGS__)
float frand(void) {
static float frand(void) {
return (float)rand()/(float)RAND_MAX;
}
int irand(int n) {
static int irand(int n) {
if (n == 0) return 0;
return rand()%n;
}
void get_random_dims(int64_t * dims, int ndims) {
static void get_random_dims(int64_t * dims, int ndims) {
dims[0] = dims[1] = dims[2] = dims[3] = 1;
for (int i = 0; i < ndims; i++) {
@@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
}
}
struct ggml_tensor * get_random_tensor_f32(
static struct ggml_tensor * get_random_tensor_f32(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
@@ -112,7 +112,7 @@ struct ggml_tensor * get_random_tensor_f32(
return result;
}
struct ggml_tensor * get_random_tensor_f16(
static struct ggml_tensor * get_random_tensor_f16(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
@@ -160,7 +160,7 @@ struct ggml_tensor * get_random_tensor_f16(
return result;
}
struct ggml_tensor * get_random_tensor_i32(
static struct ggml_tensor * get_random_tensor_i32(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
@@ -208,7 +208,7 @@ struct ggml_tensor * get_random_tensor_i32(
return result;
}
void print_elements(const char* label, const struct ggml_tensor * t) {
static void print_elements(const char* label, const struct ggml_tensor * t) {
if (!t) {
printf("%s: %s = null\n", __func__, label);
return;
@@ -228,7 +228,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
}
bool check_gradient(
static bool check_gradient(
const char * op_name,
struct ggml_context * ctx0,
struct ggml_tensor * x[],
@@ -310,7 +310,7 @@ bool check_gradient(
}
// TODO: clean-up this ..
bool check_mat_mul(
static bool check_mat_mul(
const struct ggml_tensor * y,
const struct ggml_tensor * x0,
const struct ggml_tensor * x1) {
@@ -373,9 +373,9 @@ bool check_mat_mul(
int main(int argc, const char ** argv) {
struct ggml_init_params params = {
.mem_size = 128*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
/* .mem_size = */ 128*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
int64_t ne[4];
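The initializer change is needed because C99-style designated initializers are not part of C++11 (C++20 later added a restricted form), so the port keeps the field names as comments to preserve readability of the positional list. The pattern in isolation (a sketch; the stand-in struct mirrors the three ggml_init_params fields used above):

#include <cstddef>

struct init_params {        // simplified stand-in for ggml_init_params
    size_t mem_size;
    void * mem_buffer;
    bool   no_alloc;
};

// C++11: positional aggregate initialization, field names kept as comments
struct init_params params = {
    /* .mem_size   = */ 128*1024*1024,
    /* .mem_buffer = */ NULL,
    /* .no_alloc   = */ false,
};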

View File

@@ -1,9 +1,9 @@
#include "ggml.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#define MAX_NARGS 2
@@ -119,10 +119,11 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
int main(void) {
struct ggml_init_params params = {
.mem_size = 1024*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
int64_t ne1[4] = {4, 128, 1, 1};