Compare commits

...

8 Commits

Author SHA1 Message Date
Georgi Gerganov
aa485cee33 ggml : use posix_memalign on non-Windows env 2023-04-15 14:25:45 +03:00
Ivan Komarov
c12b14b77f benchmark : fix result validation in benchmark-q4_0-matmult (#987) 2023-04-15 08:51:54 +03:00
katsu560
106faaf297 cmake : add finding the OpenBLAS header file (#992) 2023-04-15 08:51:11 +03:00
Pavol Rusnak
c85e03d12e Revert "main : alternative instruct mode (Vicuna support, etc.) (#863)" (#982)
This reverts commit f4d277ae17.
2023-04-14 22:58:43 +03:00
Pavol Rusnak
489093548c py : bump sentencepiece to 0.1.98 to support Python 3.11 (#976) 2023-04-14 19:46:49 +00:00
Stephan Walter
93265e988a make : fix dependencies, use auto variables (#983) 2023-04-14 22:39:48 +03:00
Pavol Rusnak
c56b715269 Expose type name from ggml (#970)
Avoid duplication of type names in utils

Co-authored-by: Håkon H. Hitland <haakon@likedan.net>
2023-04-14 20:05:37 +02:00
Tomáš Pazdiora
f4d277ae17 main : alternative instruct mode (Vicuna support, etc.) (#863)
* Add support for configs, add configurable prefixes / suffixes, deprecate instruct mode, add stop prompt

* Add multiline mode, update text input.

* bugfix

* update implementation

* typos

* Change --multiline implementation to be toggled by EOF.

* bugfix

* default multiline mode

* add more configs

* update formating

* update formatting

* apply suggestions
2023-04-14 18:19:17 +03:00
8 changed files with 72 additions and 41 deletions

View File

@@ -120,6 +120,21 @@ if (LLAMA_OPENBLAS)
add_compile_definitions(GGML_USE_OPENBLAS)
add_link_options(${BLAS_LIBRARIES})
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
# find header file
# Candidate directories that may contain cblas.h. The OpenBLAS_HOME
# environment variable contributes the last two candidates.
set(OPENBLAS_INCLUDE_SEARCH_PATHS
    /usr/include
    /usr/include/openblas
    /usr/include/openblas-base
    /usr/local/include
    /usr/local/include/openblas
    /usr/local/include/openblas-base
    /opt/OpenBLAS/include
    $ENV{OpenBLAS_HOME}
    $ENV{OpenBLAS_HOME}/include
    )
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
# find_path() sets OPENBLAS_INC to "OPENBLAS_INC-NOTFOUND" when cblas.h is
# missing; only hand the directory to the compiler when it was really found,
# otherwise "-IOPENBLAS_INC-NOTFOUND" would end up on every compile line.
if (OPENBLAS_INC)
    add_compile_options(-I${OPENBLAS_INC})
else()
    message(WARNING "OpenBLAS header cblas.h not found")
endif()
else()
message(WARNING "OpenBLAS not found")
endif()

View File

@@ -140,44 +140,44 @@ default: main quantize perplexity embedding
#
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp llama.h llama_util.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
llama.o: llama.cpp ggml.h llama.h llama_util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
$(CXX) $(CXXFLAGS) -c $< -o $@
clean:
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
libllama.so: llama.o ggml.o
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
#
# Tests
#
benchmark: ggml.o
$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult
.PHONY: tests

View File

@@ -24,7 +24,7 @@
float tensor_sum_elements(struct ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==6) {
if (tensor->type==GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
for (int k = 0; k < tensor->ne[0]; k++) {
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];

View File

@@ -16,9 +16,6 @@
#include <unordered_map>
#include <vector>
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
bool verbose = false;
@@ -224,7 +221,7 @@ int main(int argc, char ** argv) {
break;
}
int j;
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
// Match the type name given in argv[i] against the name of every ggml type.
// j is the candidate type index: it stops at the first matching type, or
// reaches GGML_TYPE_COUNT when nothing matches. The name must be looked up
// with j (the loop variable), not with the argv index i.
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
    // find match
}
if (j < GGML_TYPE_COUNT) {
@@ -279,7 +276,7 @@ int main(int argc, char ** argv) {
continue;
}
if (params.verbose) {
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
}
if (kv_tensor.second->type == GGML_TYPE_F16) {
is_f16 = true;
@@ -304,13 +301,14 @@ int main(int argc, char ** argv) {
// loop through quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
const ggml_type type = (ggml_type) i;
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
if (params.verbose) {
printf("testing %s ...\n", type_strs[i]);
printf("testing %s ...\n", ggml_type_name(type));
}
error_stats global_stats {};
@@ -322,7 +320,7 @@ int main(int argc, char ** argv) {
if (params.verbose) {
printf(" %s ...\n", kv_tensor.first.c_str());
}
std::string layer_name { type_strs[i] };
std::string layer_name { ggml_type_name(type) };
layer_name += "::" + kv_tensor.first;
test_roundtrip_on_layer(
layer_name,
@@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
);
}
print_error_stats(type_strs[i], global_stats, params.print_histogram);
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
}
}

40
ggml.c
View File

@@ -118,7 +118,16 @@ typedef void* thread_ret_t;
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
#define GGML_ALIGNED_MALLOC(size) aligned_alloc(GGML_MEM_ALIGN, size)
// Allocates `size` bytes aligned to GGML_MEM_ALIGN using posix_memalign().
// Returns the new allocation, or NULL when posix_memalign() reports an error.
inline static void* ggml_aligned_malloc(size_t size) {
    void* block = NULL;
    // posix_memalign() yields 0 on success and a non-zero error code otherwise
    const int status = posix_memalign(&block, GGML_MEM_ALIGN, size);
    return status == 0 ? block : NULL;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
@@ -531,31 +540,31 @@ inline static float vaddvq_f32(float32x4_t v) {
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}
inline float vminvq_f32(float32x4_t v) {
float vminvq_f32(float32x4_t v) {
return
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline float vmaxvq_f32(float32x4_t v) {
float vmaxvq_f32(float32x4_t v) {
return
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
}
inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
return vget_low_s8(vcombine_s8(a, b));
}
inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
return vget_high_s8(vcombine_s8(a, b));
}
inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
return vget_low_u8(vcombine_u8(a, b));
}
inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
return vget_high_u8(vcombine_u8(a, b));
}
@@ -2671,6 +2680,18 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
};
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");
// Printable name of each ggml type, indexed by its enum ggml_type value.
// Every entry uses a designated initializer, so each string stays bound to
// its enumerator and the listing order below carries no meaning.
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = "q4_0",
    [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_I8]   = "i8",
    [GGML_TYPE_I16]  = "i16",
    [GGML_TYPE_I32]  = "i32",
    [GGML_TYPE_F16]  = "f16",
    [GGML_TYPE_F32]  = "f32",
};
// Breaks the build when the number of types changes, as a reminder to
// revisit the table above.
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_NAME is outdated");
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
@@ -2895,6 +2916,11 @@ float ggml_type_sizef(enum ggml_type type) {
return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
}
// Returns the GGML_TYPE_NAME entry for `type`.
// `type` is used directly as the table index; no range check is done here.
const char * ggml_type_name(enum ggml_type type) {
    const char * const name = GGML_TYPE_NAME[type];
    return name;
}
size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type];
}

2
ggml.h
View File

@@ -354,6 +354,8 @@ int ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
const char * ggml_type_name(enum ggml_type type); // printable name of the type, e.g. "f32" or "q4_0"
size_t ggml_element_size(const struct ggml_tensor * tensor);
struct ggml_context * ggml_init(struct ggml_init_params params);

View File

@@ -269,16 +269,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
return ret;
}
// Maps a ggml tensor type to a short display string.
// Only F32, F16, Q4_0 and Q4_1 are handled; every other value reaches
// LLAMA_ASSERT(false).
// NOTE(review): the default branch has no return statement, so this relies on
// LLAMA_ASSERT not returning — confirm against the macro's definition.
static const char * llama_format_type(enum ggml_type type) {
switch (type) {
case GGML_TYPE_F32: return "f32";
case GGML_TYPE_F16: return "f16";
case GGML_TYPE_Q4_0: return "q4_0";
case GGML_TYPE_Q4_1: return "q4_1";
default: LLAMA_ASSERT(false);
}
}
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
size_t size = ggml_type_size(type);
for (uint32_t dim : ne) {
@@ -1582,7 +1572,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
printf("[%zu/%zu] %36s - %s, type = %6s, ",
++idx, model_loader->tensors_map.tensors.size(),
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
llama_format_type(tensor.type));
ggml_type_name(tensor.type));
// This used to be a regex, but <regex> has an extreme cost to compile times.
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
@@ -1615,7 +1605,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
}
} else {
throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
}
printf("quantizing .. ");

View File

@@ -1,2 +1,2 @@
numpy==1.24
sentencepiece==0.1.97
sentencepiece==0.1.98