Compare commits

..

24 Commits

Author SHA1 Message Date
DannyDaemonic
55bc5f0900 Call sh on build-info.sh (#1294) 2023-05-02 17:52:35 -07:00
kuvaus
9daff419f6 fix build-info.h for git submodules (#1289)
* make git build info work with submodules

---------

Co-authored-by: Green Sky <green@g-s.xyz>
2023-05-03 02:43:43 +02:00
slaren
bf4b22ffe4 fix missing parameters in llama_init_from_gpt_params (#1293) 2023-05-03 01:36:45 +02:00
Ron Evans
67c77799e0 examples : add llama_init_from_gpt_params() common function (#1290)
Signed-off-by: deadprogram <ron@hybridgroup.com>
2023-05-02 23:39:51 +03:00
Georgi Gerganov
0e6cbff1b7 llama : fix compile warnings 2023-05-02 23:09:08 +03:00
Georgi Gerganov
5d5817ca60 ggml : fix 32-bit ARM 2023-05-02 22:14:50 +03:00
Ron Evans
8c9be35ff9 examples : improve vertical alignment of a few variables (#1286)
Signed-off-by: deadprogram <ron@hybridgroup.com>
2023-05-02 20:53:52 +03:00
Marvin Gießing
cc0bb7235c ggml : fix ppc64le build error and make cmake detect Power processors (#1284)
* Fix ppc64le build issue

* Added support to detect ppc64* processors
2023-05-02 19:42:16 +03:00
Robert Brisita
2bb992f034 llama : allow 0 as a seed number. (#1275) 2023-05-02 19:23:44 +03:00
Ron Evans
e2cd506999 main : switch input_noecho to input_echo to remove negation (#979)
Signed-off-by: deadprogram <ron@hybridgroup.com>
2023-05-02 19:13:26 +03:00
slaren
2d099e5193 ggml: add names to tensors (#1268)
* ggml: add names to tensors

* minor improvements to dot file formatting
2023-05-02 16:03:00 +02:00
DannyDaemonic
f4cef87edf Add git-based build information for better issue tracking (#1232)
* Add git-based build information for better issue tracking

* macOS fix

* "build (hash)" and "CMAKE_SOURCE_DIR" changes

* Redo "CMAKE_CURRENT_SOURCE_DIR" and clearer build messages

* Fix conditional dependency on missing target

* Broke out build-info.cmake, added find_package fallback, and added build into to all examples, added dependencies to Makefile

* 4 space indenting for cmake, attempt to clean up my mess in Makefile

* Short hash, less fancy Makefile, and don't modify build-info.h if it wouldn't change it
2023-05-01 18:23:47 +02:00
slaren
58b367c2d7 cuBLAS: refactor and optimize f16 mat mul performance (#1259)
* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
2023-05-01 18:11:07 +02:00
xloem
ea3a0ad6b6 llama : update stubs for systems without mmap and mlock (#1266)
Co-authored-by: John Doe <john.doe@example.com>
2023-05-01 15:58:51 +03:00
Kerfuffle
2bdc09646d ggml : fix ggml_used_mem() (#1264) 2023-05-01 14:56:07 +03:00
Georgi Gerganov
70269cae37 llama : fix session load / save (#1263) 2023-05-01 14:54:59 +03:00
slaren
b925f1f1b0 cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
2023-05-01 13:32:22 +02:00
Alex Klinkhamer
90b19bd6ee llama : let context be const when accessing const data (#1261) 2023-05-01 10:24:20 +03:00
Georgi Gerganov
7ff0dcd320 ggml : fix UB (int << 31) 2023-04-30 22:28:51 +03:00
Pavol Rusnak
6f79699286 build: add armv{6,7,8} support to cmake (#1251)
- flags copied from Makefile
- updated comments in both CMakeLists.txt and Makefile to match reality
2023-04-30 20:48:38 +02:00
jon-chuang
a5d30b1f53 common : better default number of threads (#934)
* commit

* fix

* try-catch

* apply code review

* improve

* improve

* add macos headers

* done

* remove color

* fix windows

* minor

* fix

* Apply suggestions from code review

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>

* remove

* minor

* minor

---------

Co-authored-by: jon-chuang <jon-chuang@users.noreply.github.com>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
2023-04-30 21:41:35 +03:00
0cc4m
76a884920a ggml : add CLBlast q5_0, q5_1, q8_0 dequant kernels (#1225)
* Implement q5_0, q5_1 and q8_0

* Work around q5_0 OpenCL issue

* Fix q8_0 dequant kernel

* Move cl kernels into ggml-opencl.c

* Use two memcpy calls for q5_0 buffer transfer
2023-04-30 21:34:52 +03:00
Georgi Gerganov
6bc4400e67 ggml : add Q5 WASM SIMD + GGML_FTYPE 2023-04-30 19:07:43 +03:00
Stephan Walter
f0d70f147d Various fixes to mat_mul benchmark (#1253) 2023-04-30 12:32:37 +00:00
33 changed files with 1519 additions and 626 deletions

3
.gitignore vendored
View File

@@ -28,10 +28,11 @@ models/*
/result
/perplexity
/embedding
/benchmark-q4_0-matmult
/benchmark-matmult
/vdot
/Pipfile
build-info.h
arm_neon.h
compile_commands.json

View File

@@ -72,6 +72,39 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
#
# Build info header
#
# Generate initial build-info.h
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
endif()
# Add a custom target for build-info.h
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
# Add a custom command to rebuild build-info.h when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS "${GIT_DIR}/index"
VERBATIM
)
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
endif()
#
# Compile flags
#
@@ -258,9 +291,22 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
# TODO: arm msvc?
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
add_compile_options(-mcpu=native)
endif()
# TODO: armv6,7,8 version specific flags
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
@@ -311,8 +357,11 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
add_compile_options(-mavx512vnni)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
add_compile_options(-mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
else()
# TODO: support PowerPC
message(STATUS "Unknown architecture")
endif()

View File

@@ -135,19 +135,21 @@ ifdef LLAMA_PERF
CXXFLAGS += -DGGML_PERF
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
# Apple M1, M2, etc.
# Raspberry Pi 3, 4, Zero 2 (64-bit)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3
# Raspberry Pi 1, Zero
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 4
# Raspberry Pi 2
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
# Raspberry Pi 3, 4, Zero 2 (32-bit)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
@@ -179,40 +181,55 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
clean:
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
clean:
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
#
# Examples
#
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \
else \
rm $@.tmp; \
fi
#
# Tests
#
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
.PHONY: tests
tests:

View File

@@ -35,4 +35,5 @@ else()
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
endif()

View File

@@ -0,0 +1,7 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,13 +1,6 @@
/*
License: MIT License
Changelog:
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
*/
#include <locale.h>
#include "ggml.h"
#include "build-info.h"
#include <assert.h>
#include <math.h>
#include <cstring>
@@ -47,7 +40,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
(int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
struct benchmark_params_struct {
@@ -98,12 +91,10 @@ int main(int argc, char ** argv) {
}
}
// create the ggml context
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
printf("Starting Test\n");
// create the ggml context
struct ggml_context * ctx;
//const int sizex = 4096;
//const int sizey = 11008;
@@ -125,16 +116,18 @@ int main(int argc, char ** argv) {
#endif
//printf("Memsize required = %i\n", sizex*sizex);
ggml_type wtype = GGML_TYPE_F32;
size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizeof(float);
ctx_size += 1024*1024*100;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16;
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
@@ -145,7 +138,7 @@ int main(int argc, char ** argv) {
ctx = ggml_init(params);
if (!ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
return 1;
}
@@ -217,7 +210,7 @@ int main(int argc, char ** argv) {
const int dimz = sizez;
long long int flops_per_dot_product = dimy + dimy;
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication
@@ -234,7 +227,6 @@ int main(int argc, char ** argv) {
ggml_graph_compute(ctx, &gf31);
long long int stop = ggml_time_us();
long long int usec = stop-start;
float sec = usec/1000000;
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
i,

View File

@@ -1,13 +1,18 @@
#include "common.h"
#include <cassert>
#include <iostream>
#include <cstring>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <sstream>
#include <iostream>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined (_WIN32)
#include <fcntl.h>
@@ -25,19 +30,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int
#define CP_UTF8 65001
#endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// determine sensible default number of threads.
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
int32_t get_num_physical_cores() {
#ifdef __linux__
std::ifstream cpuinfo("/proc/cpuinfo");
params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
std::istream_iterator<std::string>(),
std::string("processor"));
#endif
if (params.n_threads == 0) {
params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
std::string line;
while (std::getline(cpuinfo, line)) {
std::size_t pos = line.find("cpu cores");
if (pos != std::string::npos) {
pos = line.find(": ", pos);
if (pos != std::string::npos) {
try {
// Extract the number and return it
return static_cast<int32_t>(std::stoul(line.substr(pos + 2)));
} catch (const std::invalid_argument &) {
// Ignore if we could not parse
}
}
}
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
//TODO: Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
gpt_params default_params;
@@ -295,7 +324,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
fprintf(stderr, " specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
@@ -376,6 +405,39 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return res;
}
struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return NULL;
}
if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(lctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return NULL;
}
}
return lctx;
}
/* Keep track of current color of output, and emit ANSI code if it changes. */
void set_console_color(console_state & con_st, console_color_t color) {
if (con_st.use_color && con_st.color != color) {

View File

@@ -13,11 +13,12 @@
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = -1; // new tokens to predict
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
@@ -76,6 +77,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Model utils
//
struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
//
// Console utils
//

View File

@@ -2,3 +2,6 @@ set(TARGET embedding)
add_executable(${TARGET} embedding.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,5 +1,6 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <ctime>
@@ -18,11 +19,13 @@ int main(int argc, char ** argv) {
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
@@ -32,24 +35,10 @@ int main(int argc, char ** argv) {
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
// print system information

View File

@@ -2,3 +2,6 @@ set(TARGET main)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -130,7 +130,7 @@ It is important to note that the generated text may be shorter than the specifie
- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
### Temperature

View File

@@ -5,6 +5,7 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
@@ -81,11 +82,13 @@ int main(int argc, char ** argv) {
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
@@ -98,34 +101,11 @@ int main(int argc, char ** argv) {
llama_context * ctx;
g_ctx = &ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
// load the model and apply lora adapter, if any
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
// print system information
@@ -161,23 +141,22 @@ int main(int argc, char ** argv) {
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
// REVIEW - fopen to check for existing session
// fopen to check for existing session
FILE * fp = std::fopen(path_session.c_str(), "rb");
if (fp != NULL) {
std::fclose(fp);
session_tokens.resize(params.n_ctx);
size_t n_token_count_out = 0;
const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
if (n_session_bytes > 0) {
fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
} else {
fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
}
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
}
@@ -214,7 +193,7 @@ int main(int argc, char ** argv) {
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
params.n_keep = (int)embd_inp.size();
}
@@ -296,7 +275,7 @@ int main(int argc, char ** argv) {
}
bool is_antiprompt = false;
bool input_noecho = false;
bool input_echo = true;
// HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
// if we loaded a session with at least 75% similarity. It's currently just used to speed up the
@@ -304,9 +283,9 @@ int main(int argc, char ** argv) {
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
// the first thing we will do is to output the prompt, so set color accordingly
@@ -329,7 +308,7 @@ int main(int argc, char ** argv) {
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
// REVIEW - stop saving session if we run out of context
// stop saving session if we run out of context
path_session = "";
//printf("\n---\n");
@@ -355,6 +334,7 @@ int main(int argc, char ** argv) {
n_session_consumed++;
if (n_session_consumed >= (int) session_tokens.size()) {
++i;
break;
}
}
@@ -410,7 +390,7 @@ int main(int argc, char ** argv) {
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
// Apply params.logit_bias map
@@ -482,7 +462,7 @@ int main(int argc, char ** argv) {
embd.push_back(id);
// echo this to console
input_noecho = false;
input_echo = true;
// decrement remaining sampling budget
--n_remain;
@@ -500,14 +480,14 @@ int main(int argc, char ** argv) {
}
// display text
if (!input_noecho) {
if (input_echo) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && (int)embd_inp.size() == n_consumed) {
if (input_echo && (int)embd_inp.size() == n_consumed) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
}
@@ -602,7 +582,7 @@ int main(int argc, char ** argv) {
n_remain -= line_inp.size();
}
input_noecho = true; // do not echo this again
input_echo = false; // do not echo this again
}
if (n_past > 0) {

View File

@@ -2,3 +2,6 @@ set(TARGET perplexity)
add_executable(${TARGET} perplexity.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,5 +1,6 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cmath>
#include <ctime>
@@ -106,11 +107,13 @@ int main(int argc, char ** argv) {
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.seed < 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
@@ -119,36 +122,11 @@ int main(int argc, char ** argv) {
llama_context * ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
// load the model and apply lora adapter, if any
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
// print system information

View File

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "build-info.h"
#define LLAMA_API_INTERNAL
#include "llama.h"
@@ -308,6 +309,8 @@ int main(int argc, char ** argv) {
return 1;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
// load the model
fprintf(stderr, "Loading model\n");

View File

@@ -2,3 +2,6 @@ set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,5 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include "build-info.h"
#include <cstdio>
#include <map>
@@ -50,6 +51,8 @@ int main(int argc, char ** argv) {
ftype = (enum llama_ftype)atoi(argv[3]);
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
int nthread = argc > 4 ? atoi(argv[4]) : 0;
const int64_t t_main_start_us = ggml_time_us();

View File

@@ -2,3 +2,6 @@ set(TARGET save-load-state)
add_executable(${TARGET} save-load-state.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,5 +1,6 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <vector>
#include <cstdio>
@@ -17,6 +18,8 @@ int main(int argc, char ** argv) {
return 1;
}
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
if (params.n_predict < 0) {
params.n_predict = 16;
}

View File

@@ -1,11 +1,38 @@
#include <cstddef>
#include <cstdint>
#include <stdint.h>
#include <stdio.h>
#include <cuda_fp16.h>
#include <atomic>
#include "ggml-cuda.h"
typedef uint16_t ggml_fp16_t;
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include "ggml-cuda.h"
#include "ggml.h"
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \
fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \
exit(1); \
} \
} while (0)
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
#define QK4_0 32
typedef struct {
@@ -24,14 +51,14 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b
#define QK4_2 16
typedef struct {
__half d; // delta
half d; // delta
uint8_t qs[QK4_2 / 2]; // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK5_0 32
typedef struct {
__half d; // delta
half d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
@@ -39,9 +66,9 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
#define QK5_1 32
typedef struct {
__half d; // delta
__half m; // min
uint32_t qh; // 5-th bit of quants
half d; // delta
half m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
@@ -162,7 +189,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
const uint8_t * pp = x[i].qs;
const uint32_t qh = x[i].qh;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vi = pp[l/2];
@@ -197,37 +225,50 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
}
}
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_0;
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_1;
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_2;
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_0;
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_1;
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK8_0;
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
}
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
// TODO: optimize
static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
const half * x = (const half *) vx;
const int i = blockIdx.x;
y[i] = __half2float(x[i]);
}
static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
}
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
@@ -241,6 +282,8 @@ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
return dequantize_row_q5_1_cuda;
case GGML_TYPE_Q8_0:
return dequantize_row_q8_0_cuda;
case GGML_TYPE_F16:
return convert_fp16_to_fp32_cuda;
default:
return nullptr;
}
@@ -271,7 +314,7 @@ struct cuda_buffer {
static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -290,7 +333,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr;
}
void ggml_cuda_pool_free(void * ptr, size_t size) {
static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -305,28 +348,55 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
CUDA_CHECK(cudaFree(ptr));
}
cublasHandle_t g_cublasH = nullptr;
cudaStream_t g_cudaStream = nullptr;
cudaStream_t g_cudaStream2 = nullptr;
cudaEvent_t g_cudaEvent = nullptr;
#define GGML_CUDA_MAX_STREAMS 8
#define GGML_CUDA_MAX_EVENTS 64
static cublasHandle_t g_cublasH = nullptr;
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
void ggml_init_cublas() {
if (g_cublasH == nullptr) {
// create cublas handle, bind a stream
CUBLAS_CHECK(cublasCreate(&g_cublasH));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
// create streams
for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
}
// create events
for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
}
// create additional stream and event for synchronization
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
// create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublasH));
CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
}
}
cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
void * ggml_cuda_host_malloc(size_t size) {
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
return nullptr;
}
void * ptr = nullptr;
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
size/1024.0/1024.0, cudaGetErrorString(err));
return nullptr;
}
return ptr;
}
void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
}
static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
const uint64_t ne0 = src->ne[0];
const uint64_t ne1 = src->ne[1];
const uint64_t nb0 = src->nb[0];
@@ -354,12 +424,293 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
}
}
void * ggml_cuda_host_malloc(size_t size) {
void * ptr;
CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
return ptr;
static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
const int n_mm = ne03 * ne02;
size_t x_size, y_size, d_size;
float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
int i = i03*ne02 + i02;
cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
float * c_X = d_X + i * x_ne;
float * c_Y = d_Y + i * y_ne;
float * c_D = d_D + i * d_ne;
// copy data to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
// compute
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, c_X, ne00,
c_Y, ne10,
&beta, c_D, ne01));
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
}
}
CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
}
void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int nb10 = src1->nb[0];
const int nb11 = src1->nb[1];
const int nb12 = src1->nb[2];
const int nb13 = src1->nb[3];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
const int n_mm = ne03 * ne02;
size_t x_size, y_size, d_size;
half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
bool src1_cont_rows = nb10 == sizeof(float);
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
int i = i03*ne02 + i02;
cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
half * c_X = d_X + i * x_ne;
half * c_Y = d_Y + i * y_ne;
float * c_D = d_D + i * d_ne;
// copy src0 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
// convert src1 to fp16
// TODO: use multiple threads
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
if (src1_cont_rows) {
if (src1_cont_cols) {
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
}
else {
for (int64_t i01 = 0; i01 < ne11; i01++) {
ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
}
}
}
else {
for (int64_t i01 = 0; i01 < ne11; i01++) {
for (int64_t i00 = 0; i00 < ne10; i00++) {
// very slow due to no inlining
tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
}
}
}
// copy src1 to device
CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
// compute
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
CUBLAS_CHECK(
cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, c_X, CUDA_R_16F, ne00,
c_Y, CUDA_R_16F, ne10,
&beta, c_D, CUDA_R_32F, ne01,
CUBLAS_COMPUTE_32F_FAST_16F,
CUBLAS_GEMM_DEFAULT));
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
}
}
CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
}
static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const ggml_type type = src0->type;
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
const int n_mm = ne03 * ne02;
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
size_t x_size, y_size, d_size, q_size;
float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
GGML_ASSERT(to_fp32_cuda != nullptr);
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
int i = i03*ne02 + i02;
cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
float * c_X = d_X + i * x_ne;
float * c_Y = d_Y + i * y_ne;
float * c_D = d_D + i * d_ne;
char * c_Q = d_Q + i * q_sz;
// copy src0 and convert to fp32 on device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
// copy src1 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
// wait for conversion
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
// compute
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, c_X, ne00,
c_Y, ne10,
&beta, c_D, ne01));
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
}
}
CUDA_CHECK(cudaDeviceSynchronize());
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
ggml_cuda_pool_free(d_Q, q_size);
}
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
return true;
}
return false;
}
bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
size_t src0_sz = ggml_nbytes(src0);
size_t src1_sz = ggml_nbytes(src1);
// mul_mat_q: src0 is converted to fp32 on device
size_t mul_mat_q_transfer = src0_sz + src1_sz;
// mul_mat_f16: src1 is converted to fp16 on cpu
size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);
// choose the smaller one to transfer to the device
// TODO: this is not always the best choice due to the overhead of converting to fp16
return mul_mat_f16_transfer < mul_mat_q_transfer;
}
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
if (src0->type == GGML_TYPE_F32) {
ggml_cuda_mul_mat_f32(src0, src1, dst);
}
else if (src0->type == GGML_TYPE_F16) {
if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
}
else {
ggml_cuda_mul_mat_q_f32(src0, src1, dst);
}
}
else if (ggml_is_quantized(src0->type)) {
ggml_cuda_mul_mat_q_f32(src0, src1, dst);
}
else {
GGML_ASSERT(false);
}
}
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
return ggml_nelements(src1) * sizeof(ggml_fp16_t);
}
else {
return 0;
}
}

View File

@@ -1,54 +1,19 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \
fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \
exit(1); \
} \
} while (0)
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
extern cublasHandle_t g_cublasH;
extern cudaStream_t g_cudaStream;
extern cudaStream_t g_cudaStream2;
extern cudaEvent_t g_cudaEvent;
void ggml_init_cublas(void);
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
// TODO: export these with GGML_API
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
void ggml_cuda_pool_free(void * ptr, size_t size);
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
#ifdef __cplusplus
}
#endif

View File

@@ -1,63 +0,0 @@
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(
struct block_q4_0
{
float d;
uchar qs[16];
};
__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const uchar vi = blocks[i].qs[l];
const uint index = i*32 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}
struct block_q4_1
{
float d;
float m;
uchar qs[16];
};
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const float m = blocks[i].m;
const uchar vi = blocks[i].qs[l];
const uint index = i*32 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}
struct block_q4_2
{
ushort d;
uchar qs[8];
};
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
const uint i = get_global_id(0) / 16;
const uint l = get_local_id(0);
const float d = vload_half(0, (__global half*) &blocks[i].d);;
const uchar vi = blocks[i].qs[l];
const uint index = i*16 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}
);

View File

@@ -3,12 +3,141 @@
#define CL_TARGET_OPENCL_VERSION 110
#include <clblast_c.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "ggml.h"
#include "ggml-opencl-dequant.cl"
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(
struct block_q4_0
{
float d;
uchar qs[16];
};
__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const uchar vi = blocks[i].qs[l];
const uint index = i*32 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}
struct block_q4_1
{
float d;
float m;
uchar qs[16];
};
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const float m = blocks[i].m;
const uchar vi = blocks[i].qs[l];
const uint index = i*32 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}
struct block_q4_2
{
ushort d;
uchar qs[8];
};
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
const uint i = get_global_id(0) / 16;
const uint l = get_local_id(0);
const float d = vload_half(0, (__global half*) &blocks[i].d);
const uchar vi = blocks[i].qs[l];
const uint index = i*16 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}
struct block_q5_0
{
float d;
uint qh;
uchar qs[16];
};
__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const uchar vi = blocks[i].qs[l];
const uint l2 = l * 2;
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const uint index = i*32 + l2;
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
}
struct block_q5_1
{
ushort d;
ushort m;
uint qh;
uchar qs[16];
};
__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = vload_half(0, (__global half*) &blocks[i].d);
const float m = vload_half(0, (__global half*) &blocks[i].m);
const uchar vi = blocks[i].qs[l];
const uint l2 = l * 2;
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const uint index = i*32 + l2;
result[index + 0] = ((vi & 0xf) | vh0)*d + m;
result[index + 1] = ((vi >> 4) | vh1)*d + m;
}
struct block_q8_0
{
float d;
char qs[32];
};
__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
}
);
#define CL_CHECK(err, name) \
do { \
@@ -19,12 +148,26 @@
} \
} while (0)
#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
typedef struct {
float d; // delta
uint32_t qh; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} cl_block_q5_0;
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_program program;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
@@ -97,6 +240,12 @@ void ggml_cl_init(void) {
CL_CHECK(err, "clCreateKernel");
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
CL_CHECK(err, "clCreateKernel");
}
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -125,6 +274,7 @@ void ggml_cl_sgemm_wrapper(
cl_kernel kernel;
size_t global = n * k, local, size_qb;
bool dequant;
cl_block_q5_0* cl_host_b;
switch (btype) {
case GGML_TYPE_F32:
@@ -146,7 +296,36 @@ void ggml_cl_sgemm_wrapper(
dequant = true;
kernel = kernel_q4_2;
local = 8;
size_qb = global * (sizeof(short) + local) / 16;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
break;
case GGML_TYPE_Q5_0:
dequant = true;
kernel = kernel_q5_0;
local = 16;
// For some reason OpenCL seems to be incapable of working with structs of size 22.
// 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
// TODO Find the reason, fix and remove workaround.
const block_q5_0* b = (const block_q5_0*) host_b;
cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
for (size_t i = 0; i < global / 32; i++) {
cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
}
host_b = (const float*) cl_host_b;
size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
break;
case GGML_TYPE_Q5_1:
dequant = true;
kernel = kernel_q5_1;
local = 16;
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
break;
case GGML_TYPE_Q8_0:
dequant = true;
kernel = kernel_q8_0;
local = 32;
size_qb = global * (sizeof(float) + local) / 32;
break;
default:
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -171,12 +350,15 @@ void ggml_cl_sgemm_wrapper(
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
CL_CHECK(err, "clSetKernelArg");
clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
CL_CHECK(err, "clEnqueueWriteBuffer qb");
} else {
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
CL_CHECK(err, "clEnqueueWriteBuffer b");
}
clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
CL_CHECK(err, "clEnqueueWriteBuffer a");
if (dequant) {
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
CL_CHECK(err, "clEnqueueNDRangeKernel");
@@ -188,15 +370,20 @@ void ggml_cl_sgemm_wrapper(
clReleaseEvent(ev_b);
cl_event ev_sgemm;
CLBlastSgemm((CLBlastLayout)order,
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
m, n, k,
alpha,
cl_buffer_a, 0, lda,
cl_buffer_b, 0, ldb,
beta,
cl_buffer_c, 0, ldc,
&queue, &ev_sgemm);
CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
m, n, k,
alpha,
cl_buffer_a, 0, lda,
cl_buffer_b, 0, ldb,
beta,
cl_buffer_c, 0, ldc,
&queue, &ev_sgemm);
if (status != CLBlastSuccess) {
fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
abort();
}
cl_event ev_c;
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
@@ -205,4 +392,7 @@ void ggml_cl_sgemm_wrapper(
clWaitForEvents(1, &ev_c);
clReleaseEvent(ev_sgemm);
clReleaseEvent(ev_c);
if (btype == GGML_TYPE_Q5_0) {
free((void*) cl_host_b);
}
}

569
ggml.c
View File

@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
#define UNUSED(x) (void)(x)
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
@@ -330,7 +322,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB)
static float table_f32_f16[1 << 16];
#if defined(__ARM_NEON)
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
return GGML_FP32_TO_FP16(x);
}
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
for (size_t i = 0; i < n; i++) {
y[i] = GGML_FP16_TO_FP32(x[i]);
}
}
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
size_t i = 0;
#if defined(__F16C__)
for (; i + 7 < n; i += 8) {
__m256 x_vec = _mm256_loadu_ps(x + i);
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storeu_si128((__m128i *)(y + i), y_vec);
}
for(; i + 3 < n; i += 4) {
__m128 x_vec = _mm_loadu_ps(x + i);
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#endif
for (; i < n; i++) {
y[i] = GGML_FP32_TO_FP16(x[i]);
}
}
//
// timing
//
@@ -653,35 +671,91 @@ float vmaxvq_f32(float32x4_t v) {
}
int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
return vget_low_s8(vcombine_s8(a, b));
int8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
return vget_high_s8(vcombine_s8(a, b));
int8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
return vget_low_u8(vcombine_u8(a, b));
uint8x8_t res;
res[0] = a[0]; res[1] = b[0];
res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2];
res[6] = a[3]; res[7] = b[3];
return res;
}
uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
return vget_high_u8(vcombine_u8(a, b));
uint8x8_t res;
res[0] = a[4]; res[1] = b[4];
res[2] = a[5]; res[3] = b[5];
res[4] = a[6]; res[5] = b[6];
res[6] = a[7]; res[7] = b[7];
return res;
}
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
int8x16_t res;
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
return res;
}
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
int8x16_t res;
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
return res;
}
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
uint8x16_t res;
res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
return res;
}
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
uint8x16_t res;
res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
return res;
}
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
@@ -808,6 +882,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
float max = 0.0f;
float min = 0.0f;
vector float asrcv [8];
vector float srcv [8];
vector float maxv[8];
vector float minv[8];
@@ -1087,7 +1162,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -1911,8 +1986,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
const uint8_t vi = pp[l/2];
// extract the 5-th bit from qh
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
const int8_t vi0 = (vi & 0x0F) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;
@@ -1948,8 +2023,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
const uint8_t vi = pp[l/2];
// extract the 5-th bit from qh
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
const uint8_t vi0 = (vi & 0x0F) | vh0;
const uint8_t vi1 = (vi >> 4) | vh1;
@@ -3180,6 +3255,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
}
*s = vaddvq_f32(sumv);
#elif defined(__wasm_simd128__)
v128_t sumv = wasm_f32x4_splat(0.0f);
uint64_t tmp[4];
for (int i = 0; i < nb; ++i) {
const block_q5_0 * restrict x0 = &x[i];
const block_q8_0 * restrict y0 = &y[i];
const v128_t m4b = wasm_i8x16_splat(0x0F);
const v128_t s16b = wasm_i8x16_splat(0x10);
// extract the 5th bit
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_u[(qh >> 24) ];
const v128_t qhl = wasm_v128_load(tmp + 0);
const v128_t qhh = wasm_v128_load(tmp + 2);
const v128_t v0 = wasm_v128_load(x0->qs);
// 4-bit -> 8-bit
const v128_t v0l = wasm_v128_and (v0, m4b);
const v128_t v0h = wasm_u8x16_shr(v0, 4);
// interleave
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
// add high bit and sub 16
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
// load y
const v128_t v1l = wasm_v128_load(y0->qs);
const v128_t v1h = wasm_v128_load(y0->qs + 16);
// int8x16 -> int16x8
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
const float x0d = GGML_FP16_TO_FP32(x0->d);
// dot product
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
}
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
@@ -3220,8 +3361,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
for (int j = 0; j < QK8_0/2; j++) {
const uint8_t v0 = x0[j];
const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
@@ -3311,6 +3452,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
}
*s = vaddvq_f32(sumv) + summs;
#elif defined(__wasm_simd128__)
v128_t sumv = wasm_f32x4_splat(0.0f);
float summs = 0.0f;
uint64_t tmp[4];
for (int i = 0; i < nb; ++i) {
const block_q5_1 * restrict x0 = &x[i];
const block_q8_1 * restrict y0 = &y[i];
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
const v128_t m4b = wasm_i8x16_splat(0x0F);
// extract the 5th bit
uint32_t qh;
memcpy(&qh, x0->qh, sizeof(qh));
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
tmp[3] = table_b2b_u[(qh >> 24) ];
const v128_t qhl = wasm_v128_load(tmp + 0);
const v128_t qhh = wasm_v128_load(tmp + 2);
const v128_t v0 = wasm_v128_load(x0->qs);
// 4-bit -> 8-bit
const v128_t v0l = wasm_v128_and (v0, m4b);
const v128_t v0h = wasm_u8x16_shr(v0, 4);
static bool x = true;
// interleave
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
// add high bit
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
// load y
const v128_t v1l = wasm_v128_load(y0->qs);
const v128_t v1h = wasm_v128_load(y0->qs + 16);
// int8x16 -> int16x8
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
const float x0d = GGML_FP16_TO_FP32(x0->d);
// dot product
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
wasm_i32x4_add(
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
}
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
@@ -3354,8 +3566,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
for (int j = 0; j < QK8_1/2; j++) {
const uint8_t v0 = x0[j];
const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
const int x0_0 = (v0 & 0x0F) | x0_0h;
const int x1_0 = (v0 >> 4) | x1_0h;
@@ -4057,6 +4269,27 @@ bool ggml_is_quantized(enum ggml_type type) {
return GGML_IS_QUANTIZED[type];
}
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
enum ggml_type wtype = GGML_TYPE_COUNT;
switch (ftype) {
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
GGML_ASSERT(wtype != GGML_TYPE_COUNT);
return wtype;
}
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
@@ -4167,12 +4400,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
// initialize cuBLAS
#if defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_CUBLAS)
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif
#endif
is_first_call = false;
}
@@ -4253,7 +4485,7 @@ void ggml_free(struct ggml_context * ctx) {
}
size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end->offs + ctx->objects_end->size;
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -4366,6 +4598,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.pad =*/ { 0 },
};
@@ -4720,6 +4953,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
return (float *)(tensor->data);
}
const char * ggml_get_name(const struct ggml_tensor * tensor) {
return tensor->name;
}
void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
strncpy(tensor->name, name, sizeof(tensor->name));
tensor->name[sizeof(tensor->name) - 1] = '\0';
}
struct ggml_tensor * ggml_view_tensor(
struct ggml_context * ctx,
const struct ggml_tensor * src) {
@@ -5819,6 +6061,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
ggml_set_name(b, "n_past");
result->op = GGML_OP_DIAG_MASK_INF;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5876,6 +6119,7 @@ struct ggml_tensor * ggml_rope(
((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
ggml_set_name(b, "n_past, n_dims, mode");
result->op = GGML_OP_ROPE;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7943,7 +8187,7 @@ static void ggml_compute_forward_rms_norm(
// ggml_compute_forward_mul_mat
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(
@@ -7959,12 +8203,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
if (
#if !defined(GGML_USE_CUBLAS)
ggml_is_contiguous(src0) &&
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
#endif
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
@@ -7972,7 +8213,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
return false;
}
#endif
static void ggml_compute_forward_mul_mat_f32(
@@ -7988,7 +8228,7 @@ static void ggml_compute_forward_mul_mat_f32(
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
const int64_t ne10 = src1->ne[0];
#endif
const int64_t ne11 = src1->ne[1];
@@ -8045,7 +8285,16 @@ static void ggml_compute_forward_mul_mat_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CUBLAS)
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -8059,43 +8308,13 @@ static void ggml_compute_forward_mul_mat_f32(
return;
}
#if defined(GGML_USE_CUBLAS)
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
size_t x_size, y_size, d_size;
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
#if !defined(GGML_USE_CUBLAS)
const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
#endif
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
#if defined(GGML_USE_CUBLAS)
// copy data to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
// compute
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, d_X, ne00,
d_Y, ne10,
&beta, d_D, ne01));
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
#elif defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CLBLAST)
// zT = y * xT
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
ne11, ne01, ne10,
@@ -8112,12 +8331,6 @@ static void ggml_compute_forward_mul_mat_f32(
#endif
}
}
#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
#endif
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
@@ -8247,7 +8460,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CUBLAS)
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
GGML_ASSERT(nb10 == sizeof(float));
@@ -8263,37 +8485,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
return;
}
#if defined(GGML_USE_CUBLAS)
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
size_t x_size, y_size, d_size;
ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
#if defined(GGML_USE_CUBLAS)
// copy src0 while converting src1
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
// with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne11; ++i01) {
for (int64_t i00 = 0; i00 < ne10; ++i00) {
wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
}
}
assert(id*sizeof(ggml_fp16_t) <= params->wsize);
}
#else
float * const wdata = params->wdata;
{
size_t id = 0;
@@ -8305,28 +8498,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
assert(id*sizeof(float) <= params->wsize);
}
#endif
#if defined(GGML_USE_CUBLAS)
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
// copy data to device
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
// compute
CUBLAS_CHECK(
cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, d_X, CUDA_R_16F, ne00,
d_Y, CUDA_R_16F, ne10,
&beta, d_D, CUDA_R_32F, ne01,
CUBLAS_COMPUTE_32F,
CUBLAS_GEMM_DEFAULT));
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
#elif defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CLBLAST)
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@@ -8355,12 +8528,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
}
}
#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
#endif
/*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
return;
@@ -8513,7 +8680,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CUBLAS)
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
}
return;
}
#endif
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
if (params->ith != 0) {
return;
@@ -8527,25 +8703,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
return;
}
#if defined(GGML_USE_CUBLAS)
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne00;
const int y_ne = ne11 * ne10;
const int d_ne = ne11 * ne01;
size_t x_size, y_size, d_size, q_size;
float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
GGML_ASSERT(dequantize_row_q_cuda != NULL);
#else
float * const wdata = params->wdata;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8553,14 +8712,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
#if defined(GGML_USE_CUBLAS)
// copy and dequantize on device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
#elif defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CLBLAST)
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
#else
{
@@ -8576,24 +8728,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
const float * x = wdata;
#endif
#if defined(GGML_USE_CUBLAS)
// copy data to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
// wait for dequantization
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
// compute
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, d_X, ne00,
d_Y, ne10,
&beta, d_D, ne01));
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
#elif defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CLBLAST)
// zT = y * xT
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
ne11, ne01, ne10,
@@ -8611,13 +8746,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
}
}
#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
ggml_cuda_pool_free(d_Q, q_size);
#endif
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
@@ -11601,18 +11729,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
size_t cur = 0;
#if defined(GGML_USE_CUBLAS)
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
}
else
#endif
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
#if defined(GGML_USE_CUBLAS)
// with cuBLAS, we need memory for the full 3D / 4D data of src1
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
#else
// here we need memory just for single 2D matrix from src0
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
#endif
} else {
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
}
@@ -11621,13 +11752,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
#endif
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
}
#endif
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -12056,10 +12187,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
snprintf(color, sizeof(color), "white");
}
fprintf(fp, " \"%p\" [ \
style = filled; fillcolor = %s; shape = record; \
label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
(void *) node, color,
fprintf(fp, " \"%p\" [ "
"style = filled; fillcolor = %s; shape = record; "
"label=\"",
(void *) node, color);
if (strlen(node->name) > 0) {
fprintf(fp, "%s |", node->name);
}
fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
i, node->ne[0], node->ne[1],
GGML_OP_SYMBOL[node->op]);
@@ -12075,18 +12212,26 @@ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
snprintf(color, sizeof(color), "pink");
if (ggml_nelements(node) == 1) {
fprintf(fp, " \"%p\" [ \
style = filled; fillcolor = %s; shape = record; \
label=\"<x>%.1e\"; ]\n",
(void *) node, color, (double)ggml_get_f32_1d(node, 0));
} else {
fprintf(fp, " \"%p\" [ \
style = filled; fillcolor = %s; shape = record; \
label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
(void *) node, color,
i, node->ne[0], node->ne[1]);
fprintf(fp, " \"%p\" [ "
"style = filled; fillcolor = %s; shape = record; "
"label=\"<x>",
(void *) node, color);
if (strlen(node->name) > 0) {
fprintf(fp, "%s | ", node->name);
}
if (ggml_nelements(node) == 1) {
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
}
else {
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
}
}
else {
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
}
fprintf(fp, "\"; ]\n");
}
for (int i = 0; i < gb->n_nodes; i++) {
@@ -12899,8 +13044,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int l = 0; l < QK5_0; l += 2) {
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
// cast to 16 bins
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
@@ -12929,8 +13074,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
memcpy(&qh, &y[i].qh, sizeof(qh));
for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
// cast to 16 bins
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;

36
ggml.h
View File

@@ -197,6 +197,14 @@
#define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#ifdef __cplusplus
extern "C" {
#endif
@@ -212,6 +220,9 @@ extern "C" {
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
struct ggml_object;
struct ggml_context;
@@ -232,6 +243,20 @@ extern "C" {
GGML_TYPE_COUNT,
};
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
GGML_FTYPE_ALL_F32 = 0,
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
// available tensor operations:
enum ggml_op {
GGML_OP_NONE = 0,
@@ -325,7 +350,10 @@ extern "C" {
int64_t perf_time_us;
void * data;
char padding[8];
char name[32];
char padding[8]; // TODO: remove and add padding to name?
};
// computation graph
@@ -385,6 +413,9 @@ extern "C" {
GGML_API bool ggml_is_quantized(enum ggml_type type);
// TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -445,6 +476,9 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
//
// operations on tensors with backpropagation
//

View File

@@ -243,7 +243,8 @@ struct llama_mmap {
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *) {
llama_mmap(struct llama_file *, bool prefetch = true) {
(void)prefetch;
throw std::string("mmap not supported");
}
#endif
@@ -382,8 +383,13 @@ struct llama_mlock {
#else
static constexpr bool SUPPORTED = false;
void raw_lock(const void * addr, size_t size) {
size_t lock_granularity() {
return (size_t) 65536;
}
bool raw_lock(const void * addr, size_t size) {
fprintf(stderr, "warning: mlock not supported on this system\n");
return false;
}
void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +401,8 @@ struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
llama_buffer() = default;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
@@ -404,27 +412,59 @@ struct llama_buffer {
~llama_buffer() {
delete[] addr;
}
// disable copy and move
llama_buffer(const llama_buffer&) = delete;
llama_buffer(llama_buffer&&) = delete;
llama_buffer& operator=(const llama_buffer&) = delete;
llama_buffer& operator=(llama_buffer&&) = delete;
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
bool is_cuda;
size_t size = 0;
llama_ctx_buffer() = default;
void resize(size_t size) {
if (addr) {
ggml_cuda_host_free(addr);
}
free();
addr = (uint8_t *) ggml_cuda_host_malloc(size);
if (addr) {
is_cuda = true;
}
else {
// fall back to pageable memory
addr = new uint8_t[size];
is_cuda = false;
}
this->size = size;
}
~llama_ctx_buffer() {
void free() {
if (addr) {
ggml_cuda_host_free(addr);
if (is_cuda) {
ggml_cuda_host_free(addr);
}
else {
delete[] addr;
}
}
addr = NULL;
}
~llama_ctx_buffer() {
free();
}
// disable copy and move
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;

182
llama.cpp
View File

@@ -659,6 +659,7 @@ struct llama_model_loader {
LLAMA_ASSERT(lt.ne.size() == 1);
tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
}
ggml_set_name(tensor, lt.name.c_str());
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
lt.ggml_tensor = tensor;
num_ggml_tensors_created++;
@@ -727,8 +728,7 @@ struct llama_model_loader {
LLAMA_ASSERT(offset == lt.size);
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
// Let's load the data into temporary buffers to ensure the OS performs large loads.
std::vector<llama_buffer> tmp_bufs;
tmp_bufs.resize(lt.shards.size());
std::vector<llama_buffer> tmp_bufs(lt.shards.size());
for (size_t i = 0; i < lt.shards.size(); i++) {
llama_load_tensor_shard & shard = lt.shards.at(i);
llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -799,6 +799,8 @@ static bool kv_cache_init(
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
ggml_set_name(cache.k, "cache_k");
ggml_set_name(cache.v, "cache_v");
return true;
}
@@ -807,7 +809,7 @@ struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.n_ctx =*/ 512,
/*.n_parts =*/ -1,
/*.seed =*/ 0,
/*.seed =*/ -1,
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
@@ -1085,6 +1087,7 @@ static bool llama_eval_internal(
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
memcpy(embd->data, tokens, N*ggml_element_size(embd));
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1111,6 +1114,8 @@ static bool llama_eval_internal(
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Qcur, "Qcur");
ggml_set_name(Kcur, "Kcur");
// store key and value to memory
{
@@ -1131,6 +1136,7 @@ static bool llama_eval_internal(
ggml_permute(ctx0,
Qcur,
0, 2, 1, 3);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
ggml_permute(ctx0,
@@ -1138,21 +1144,26 @@ static bool llama_eval_internal(
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
struct ggml_tensor * V =
@@ -1161,9 +1172,11 @@ static bool llama_eval_internal(
n_ctx*ggml_element_size(kv_self.v),
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
ggml_set_name(V, "V");
#if 1
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
ggml_set_name(KQV, "KQV");
#else
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1174,11 +1187,13 @@ static bool llama_eval_internal(
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
ggml_set_name(cur, "KQV_merged_contiguous");
// projection (no bias)
cur = ggml_mul_mat(ctx0,
@@ -1687,7 +1702,7 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
}
}
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
if (last_tokens_size == 0 || penalty == 1.0f) {
return;
}
@@ -1716,7 +1731,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
}
}
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
return;
}
@@ -2038,7 +2053,7 @@ struct llama_context * llama_init_from_file(
llama_context * ctx = new llama_context;
if (params.seed <= 0) {
if (params.seed < 0) {
params.seed = time(NULL);
}
@@ -2373,21 +2388,21 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
}
}
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
return ctx->model.kv_self.n;
}
#define LLAMA_MAX_RNG_STATE 64*1024
void llama_set_rng_seed(struct llama_context * ctx, int seed) {
if (seed <= 0) {
if (seed < 0) {
seed = time(NULL);
}
ctx->rng.seed(seed);
}
// Returns the size of the state
size_t llama_get_state_size(struct llama_context * ctx) {
size_t llama_get_state_size(const struct llama_context * ctx) {
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
// for reference, std::mt19937(1337) serializes to 6701 bytes.
const size_t s_rng_size = sizeof(size_t);
@@ -2567,6 +2582,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
return nread;
}
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
llama_file file(path_session, "rb");
// sanity checks
{
const uint32_t magic = file.read_u32();
const uint32_t version = file.read_u32();
if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
return false;
}
llama_hparams session_hparams;
file.read_raw(&session_hparams, sizeof(llama_hparams));
if (session_hparams != ctx->model.hparams) {
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
return false;
}
}
// load the prompt
{
const uint32_t n_token_count = file.read_u32();
if (n_token_count > n_token_capacity) {
fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
return false;
}
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
*n_token_count_out = n_token_count;
}
// restore the context state
{
const size_t n_state_size_cur = file.size - file.tell();
const size_t n_state_size_exp = llama_get_state_size(ctx);
if (n_state_size_cur != n_state_size_exp) {
fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
return false;
}
std::vector<uint8_t> state_data(n_state_size_cur);
file.read_raw(state_data.data(), n_state_size_cur);
llama_set_state_data(ctx, state_data.data());
}
return true;
}
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
llama_file file(path_session, "wb");
file.write_u32(LLAMA_SESSION_MAGIC);
file.write_u32(LLAMA_SESSION_VERSION);
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
// save the prompt
file.write_u32((uint32_t) n_token_count);
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
// save the context state
{
const size_t n_state_size = llama_get_state_size(ctx);
std::vector<uint8_t> state_data(n_state_size);
llama_copy_state_data(ctx, state_data.data());
file.write_raw(state_data.data(), n_state_size);
}
return true;
}
int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
@@ -2605,15 +2699,15 @@ int llama_tokenize(
return res.size();
}
int llama_n_vocab(struct llama_context * ctx) {
int llama_n_vocab(const struct llama_context * ctx) {
return ctx->vocab.id_to_token.size();
}
int llama_n_ctx(struct llama_context * ctx) {
int llama_n_ctx(const struct llama_context * ctx) {
return ctx->model.hparams.n_ctx;
}
int llama_n_embd(struct llama_context * ctx) {
int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
@@ -2625,7 +2719,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}
const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
if (token >= llama_n_vocab(ctx)) {
return nullptr;
}
@@ -2694,57 +2788,3 @@ const char * llama_print_system_info(void) {
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
return ctx->model.tensors_by_name;
}
size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
// TODO leverage mmap
llama_file file(path_session, "rb");
const uint32_t magic = file.read_u32();
const uint32_t version = file.read_u32();
if (!(magic == 'ggsn' && version == 0)) {
fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
return 0;
}
llama_hparams session_hparams;
file.read_raw(&session_hparams, sizeof(llama_hparams));
// REVIEW
if (session_hparams != ctx->model.hparams) {
fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
return 0;
}
const uint32_t n_token_count = file.read_u32();
LLAMA_ASSERT(n_token_capacity >= n_token_count);
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
*n_token_count_out = n_token_count;
const size_t n_state_size = file.size - file.tell();
const size_t n_orig_state_size = llama_get_state_size(ctx);
if (n_state_size != n_orig_state_size) {
fprintf(stderr, "%s : failed to validate state size\n", __func__);
}
std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
file.read_raw(state_data.get(), n_state_size);
return llama_set_state_data(ctx, state_data.get());
}
size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
// TODO save temp & swap
llama_file file(path_session, "wb");
const size_t n_state_size = llama_get_state_size(ctx);
std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
llama_copy_state_data(ctx, state_data.get());
file.write_u32('ggsn'); // magic
file.write_u32(0); // version
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
file.write_u32((uint32_t) n_token_count); // REVIEW
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
file.write_raw(state_data.get(), n_state_size);
return n_state_size; // REVIEW
}

30
llama.h
View File

@@ -19,9 +19,11 @@
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 'ggjt'
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
#define LLAMA_SESSION_MAGIC 'ggsn'
#define LLAMA_SESSION_VERSION 0
#ifdef __cplusplus
extern "C" {
@@ -54,7 +56,7 @@ extern "C" {
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
int seed; // RNG seed, -1 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -120,13 +122,13 @@ extern "C" {
int n_threads);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
// Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
@@ -138,8 +140,8 @@ extern "C" {
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
// Save/load session file
LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
@@ -164,9 +166,9 @@ extern "C" {
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
@@ -180,7 +182,7 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
@@ -190,10 +192,10 @@ extern "C" {
// Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

53
scripts/build-info.cmake Normal file
View File

@@ -0,0 +1,53 @@
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown")
# Look for git
find_package(Git)
if(NOT Git_FOUND)
execute_process(
COMMAND which git
OUTPUT_VARIABLE GIT_EXECUTABLE
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(NOT GIT_EXECUTABLE STREQUAL "")
set(Git_FOUND TRUE)
message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}")
else()
message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.")
endif()
endif()
# Get the commit count and hash
if(Git_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE HEAD
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE GIT_HEAD_RESULT
)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE COUNT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE GIT_COUNT_RESULT
)
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
set(BUILD_COMMIT ${HEAD})
set(BUILD_NUMBER ${COUNT})
endif()
endif()
# Only write the header if it's changed to prevent unnecessary recompilation
if(EXISTS ${HEADER_FILE})
file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"")
list(GET CONTENTS 0 EXISTING)
if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"")
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
endif()
else()
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
endif()

7
scripts/build-info.h.in Normal file
View File

@@ -0,0 +1,7 @@
#ifndef BUILD_INFO_H
#define BUILD_INFO_H
#define BUILD_NUMBER @BUILD_NUMBER@
#define BUILD_COMMIT "@BUILD_COMMIT@"
#endif // BUILD_INFO_H

22
scripts/build-info.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/bin/sh
BUILD_NUMBER="0"
BUILD_COMMIT="unknown"
REV_LIST=$(git rev-list --count HEAD)
if [ $? -eq 0 ]; then
BUILD_NUMBER=$REV_LIST
fi
REV_PARSE=$(git rev-parse --short HEAD)
if [ $? -eq 0 ]; then
BUILD_COMMIT=$REV_PARSE
fi
echo "#ifndef BUILD_INFO_H"
echo "#define BUILD_INFO_H"
echo ""
echo "#define BUILD_NUMBER $BUILD_NUMBER"
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\""
echo ""
echo "#endif // BUILD_INFO_H"

View File

@@ -131,7 +131,7 @@ void test_repetition_penalty(
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
llama_sample_repetition_penalty(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), penalty);
llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
@@ -160,7 +160,7 @@ void test_frequency_presence_penalty(
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
// DUMP(&candidates_p);
llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
llama_sample_softmax(nullptr, &candidates_p);
// DUMP(&candidates_p);