mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
49 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29fe516913 | ||
|
|
207b51900e | ||
|
|
6e08281e58 | ||
|
|
2046eb4345 | ||
|
|
71a09da301 | ||
|
|
d69d777c02 | ||
|
|
ff3bad83e2 | ||
|
|
82a6646e02 | ||
|
|
ba231e8a6d | ||
|
|
8a2f2fea29 | ||
|
|
bd6d9e2059 | ||
|
|
ee1a0ec9cb | ||
|
|
177461104b | ||
|
|
fdee152e4e | ||
|
|
41aee4df82 | ||
|
|
6d459cbfbe | ||
|
|
c8d6a1f34a | ||
|
|
2f9ec7e271 | ||
|
|
34b2a5e1ee | ||
|
|
6961c4bd0b | ||
|
|
cc44877486 | ||
|
|
ad93962657 | ||
|
|
1717521cdb | ||
|
|
b2f7e04bd3 | ||
|
|
abd21fc99f | ||
|
|
2b4ea35e56 | ||
|
|
daab3d7f45 | ||
|
|
469c9addef | ||
|
|
e3932593d4 | ||
|
|
9d02956443 | ||
|
|
69a6735087 | ||
|
|
5be6c803fa | ||
|
|
6336701c93 | ||
|
|
96981f37b1 | ||
|
|
438c2ca830 | ||
|
|
9e70cc0322 | ||
|
|
5a42a5f8e8 | ||
|
|
a5e7dbd614 | ||
|
|
d3956aea53 | ||
|
|
22c69a2794 | ||
|
|
465219b914 | ||
|
|
d1031cf49c | ||
|
|
8cf19d60dc | ||
|
|
a0edf73bda | ||
|
|
f439e506e8 | ||
|
|
e78f3ef24a | ||
|
|
f3b25e4043 | ||
|
|
60abea9798 | ||
|
|
004797f6ac |
@@ -1,8 +1,7 @@
|
||||
---
|
||||
name: Issue and enhancement template
|
||||
about: Used to report issues and request enhancements for llama.cpp
|
||||
title: "[User] Insert summary of your issue or enhancement.."
|
||||
labels: ''
|
||||
name: Bug template
|
||||
about: Used to report bugs in llama.cpp
|
||||
labels: ["bug-unconfirmed"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
@@ -46,7 +45,7 @@ $ g++ --version
|
||||
|
||||
# Failure Information (for bugs)
|
||||
|
||||
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
|
||||
Please help provide information about the failure / bug.
|
||||
|
||||
# Steps to Reproduce
|
||||
|
||||
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
Normal file
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
---
|
||||
name: Enhancement template
|
||||
about: Used to request enhancements for llama.cpp
|
||||
labels: ["enhancement"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Prerequisites
|
||||
|
||||
Please answer the following questions for yourself before submitting an issue.
|
||||
|
||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
||||
|
||||
# Feature Description
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||
|
||||
# Motivation
|
||||
|
||||
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||
|
||||
# Possible Implementation
|
||||
|
||||
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -10,6 +10,7 @@
|
||||
*.gcno
|
||||
*.gcda
|
||||
*.dot
|
||||
*.bat
|
||||
*.metallib
|
||||
.DS_Store
|
||||
.build/
|
||||
|
||||
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
|
||||
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
|
||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
@@ -93,7 +94,6 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
@@ -277,13 +277,8 @@ if (LLAMA_BLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_HEADERS_EXTRA k_quants.h)
|
||||
set(GGML_SOURCES_EXTRA k_quants.c)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
@@ -305,6 +300,9 @@ if (LLAMA_CUBLAS)
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
||||
@@ -331,6 +329,7 @@ if (LLAMA_CUBLAS)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
@@ -404,6 +403,9 @@ if (LLAMA_HIPBLAS)
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_MMQ)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
@@ -665,6 +667,8 @@ add_library(ggml OBJECT
|
||||
ggml-alloc.h
|
||||
ggml-backend.c
|
||||
ggml-backend.h
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
|
||||
40
Makefile
40
Makefile
@@ -1,7 +1,7 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
|
||||
simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \
|
||||
speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
@@ -342,13 +342,9 @@ else
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
ifdef LLAMA_QKK_64
|
||||
MK_CPPFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac OS - include Accelerate framework.
|
||||
@@ -365,7 +361,7 @@ ifdef LLAMA_MPI
|
||||
MK_CPPFLAGS += -DGGML_USE_MPI
|
||||
MK_CFLAGS += -Wno-cast-qual
|
||||
MK_CXXFLAGS += -Wno-cast-qual
|
||||
OBJS += ggml-mpi.o
|
||||
OBJS += ggml-mpi.o
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
@@ -382,7 +378,7 @@ endif # LLAMA_BLIS
|
||||
ifdef LLAMA_CUBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
OBJS += ggml-cuda.o
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(LLAMA_CUDA_NVCC)
|
||||
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
|
||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # LLAMA_CUDA_FORCE_DMMV
|
||||
ifdef LLAMA_CUDA_FORCE_MMQ
|
||||
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||
endif # LLAMA_CUDA_FORCE_MMQ
|
||||
ifdef LLAMA_CUDA_DMMV_X
|
||||
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
else
|
||||
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
k_quants.o: k_quants.c k_quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif # LLAMA_NO_K_QUANTS
|
||||
|
||||
# combine build flags with cmdline overrides
|
||||
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
|
||||
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
|
||||
@@ -539,15 +533,18 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o
|
||||
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
|
||||
COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
||||
COMMON_DEPS = common.o sampling.o grammar-parser.o
|
||||
|
||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||
common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
|
||||
@@ -605,15 +602,8 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
|
||||
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||
|
||||
|
||||
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -42,13 +42,12 @@ let package = Package(
|
||||
"llama.cpp",
|
||||
"ggml-alloc.c",
|
||||
"ggml-backend.c",
|
||||
"k_quants.c",
|
||||
"ggml-quants.c",
|
||||
] + additionalSources,
|
||||
resources: resources,
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||
.define("GGML_USE_K_QUANTS"),
|
||||
.define("GGML_USE_ACCELERATE")
|
||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||
// We should consider add this in the future when we drop support for iOS 14
|
||||
|
||||
11
README.md
11
README.md
@@ -10,13 +10,9 @@
|
||||
Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
### Hot topics
|
||||
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
|
||||
- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
|
||||
- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
|
||||
**Devs should become familiar with the new API**
|
||||
- Local Falcon 180B inference on Mac Studio
|
||||
|
||||
https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
|
||||
- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
|
||||
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
|
||||
|
||||
----
|
||||
|
||||
@@ -105,7 +101,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
@@ -966,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
|
||||
|
||||
- [main](./examples/main/README.md)
|
||||
- [server](./examples/server/README.md)
|
||||
- [embd-input](./examples/embd-input/README.md)
|
||||
- [jeopardy](./examples/jeopardy/README.md)
|
||||
- [BLIS](./docs/BLIS.md)
|
||||
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
|
||||
|
||||
22
build.zig
22
build.zig
@@ -116,30 +116,26 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
var make = try Maker.init(b);
|
||||
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
|
||||
|
||||
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
|
||||
try make.addFlag("-DGGML_USE_K_QUANTS");
|
||||
const k_quants = make.obj("k_quants", "k_quants.c");
|
||||
try make.objs.append(k_quants);
|
||||
}
|
||||
|
||||
const ggml = make.obj("ggml", "ggml.c");
|
||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("console", "common/console.cpp");
|
||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
@@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
const std::string arg_prefix = "--";
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
@@ -224,6 +224,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
sparams.temp = std::stof(argv[i]);
|
||||
sparams.temp = std::max(sparams.temp, 0.0f);
|
||||
} else if (arg == "--tfs") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -241,25 +242,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.repeat_last_n = std::stoi(argv[i]);
|
||||
sparams.penalty_last_n = std::stoi(argv[i]);
|
||||
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
|
||||
} else if (arg == "--repeat-penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.repeat_penalty = std::stof(argv[i]);
|
||||
sparams.penalty_repeat = std::stof(argv[i]);
|
||||
} else if (arg == "--frequency-penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.frequency_penalty = std::stof(argv[i]);
|
||||
sparams.penalty_freq = std::stof(argv[i]);
|
||||
} else if (arg == "--presence-penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.presence_penalty = std::stof(argv[i]);
|
||||
sparams.penalty_present = std::stof(argv[i]);
|
||||
} else if (arg == "--mirostat") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -572,7 +574,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.grammar = argv[i];
|
||||
sparams.grammar = argv[i];
|
||||
} else if (arg == "--grammar-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -587,7 +589,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
std::copy(
|
||||
std::istreambuf_iterator<char>(file),
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.grammar)
|
||||
std::back_inserter(sparams.grammar)
|
||||
);
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
// Parse args for logging parameters
|
||||
@@ -631,6 +633,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
process_escapes(params.prompt);
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
process_escapes(sparams.cfg_negative_prompt);
|
||||
for (auto & antiprompt : params.antiprompt) {
|
||||
process_escapes(antiprompt);
|
||||
}
|
||||
@@ -640,7 +643,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
|
||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
const llama_sampling_params & sparams = params.sampling_params;
|
||||
const llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
printf("\n");
|
||||
@@ -678,10 +681,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
|
||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
|
||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
|
||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
|
||||
printf(" --mirostat N use Mirostat sampling.\n");
|
||||
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
|
||||
@@ -741,7 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif
|
||||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
@@ -878,15 +881,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
{
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
||||
llama_kv_cache_tokens_rm(lctx, -1, -1);
|
||||
llama_kv_cache_clear(lctx);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
|
||||
@@ -939,7 +942,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
|
||||
}
|
||||
|
||||
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
const llama_token bos_id = llama_token_bos(ctx);
|
||||
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
|
||||
|
||||
std::string piece;
|
||||
std::string result;
|
||||
@@ -1123,28 +1126,28 @@ std::string get_sortable_timestamp() {
|
||||
|
||||
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const llama_sampling_params & sparams = params.sampling_params;
|
||||
const llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||
|
||||
#ifdef NDEBUG
|
||||
fprintf(stream, "debug: false\n");
|
||||
@@ -1178,13 +1181,13 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
|
||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
|
||||
dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
|
||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||
|
||||
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
|
||||
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
|
||||
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||
|
||||
@@ -1238,14 +1241,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
|
||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
|
||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
|
||||
|
||||
fprintf(stream, "reverse_prompt:\n");
|
||||
for (std::string ap : params.antiprompt) {
|
||||
|
||||
@@ -56,7 +56,7 @@ struct gpt_params {
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sampling_params;
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
@@ -66,7 +66,6 @@ struct gpt_params {
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
|
||||
|
||||
@@ -399,7 +399,7 @@ namespace grammar_parser {
|
||||
void print_grammar(FILE * file, const parse_state & state) {
|
||||
try {
|
||||
std::map<uint32_t, std::string> symbol_id_names;
|
||||
for (auto kv : state.symbol_ids) {
|
||||
for (const auto & kv : state.symbol_ids) {
|
||||
symbol_id_names[kv.second] = kv.first;
|
||||
}
|
||||
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
||||
|
||||
35
common/log.h
35
common/log.h
@@ -97,22 +97,23 @@
|
||||
#define LOG_TEE_TARGET stderr
|
||||
#endif
|
||||
|
||||
// NOTE: currently disabled as it produces too many log files
|
||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
||||
inline std::string log_get_pid()
|
||||
{
|
||||
static std::string pid;
|
||||
if (pid.empty())
|
||||
{
|
||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// trying to write to the same log.
|
||||
std::stringstream ss;
|
||||
ss << std::this_thread::get_id();
|
||||
pid = ss.str();
|
||||
}
|
||||
|
||||
return pid;
|
||||
}
|
||||
//inline std::string log_get_pid()
|
||||
//{
|
||||
// static std::string pid;
|
||||
// if (pid.empty())
|
||||
// {
|
||||
// // std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// // it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// // trying to write to the same log.
|
||||
// std::stringstream ss;
|
||||
// ss << std::this_thread::get_id();
|
||||
// pid = ss.str();
|
||||
// }
|
||||
//
|
||||
// return pid;
|
||||
//}
|
||||
|
||||
// Utility function for generating log file names with unique id based on thread id.
|
||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
||||
@@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
|
||||
std::stringstream buf;
|
||||
|
||||
buf << log_file_basename;
|
||||
buf << ".";
|
||||
buf << log_get_pid();
|
||||
//buf << ".";
|
||||
//buf << log_get_pid();
|
||||
buf << ".";
|
||||
buf << log_file_extension;
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#include "sampling.h"
|
||||
|
||||
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
|
||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
|
||||
struct llama_sampling_context * result = new llama_sampling_context();
|
||||
|
||||
result->params = params.sampling_params;
|
||||
result->params = params;
|
||||
result->grammar = nullptr;
|
||||
|
||||
// if there is a grammar, parse it
|
||||
@@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa
|
||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
result->prev.resize(params.n_ctx);
|
||||
result->prev.resize(params.n_prev);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -66,25 +66,56 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds
|
||||
dst->prev = src->prev;
|
||||
}
|
||||
|
||||
llama_token llama_sampling_last(llama_sampling_context * ctx) {
|
||||
return ctx->prev.back();
|
||||
}
|
||||
|
||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
|
||||
const int size = ctx_sampling->prev.size();
|
||||
|
||||
n = std::min(n, size);
|
||||
|
||||
std::string result;
|
||||
|
||||
for (int i = size - n; i < size; i++) {
|
||||
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_sampling_print(const llama_sampling_params & params) {
|
||||
char result[1024];
|
||||
|
||||
snprintf(result, sizeof(result),
|
||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
||||
params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
|
||||
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||
|
||||
return std::string(result);
|
||||
}
|
||||
|
||||
llama_token llama_sampling_sample(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
const int idx) {
|
||||
const int n_ctx = llama_n_ctx(ctx_main);
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const llama_sampling_params & params = ctx_sampling->params;
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
||||
const float penalty_repeat = params.penalty_repeat;
|
||||
const float penalty_freq = params.penalty_freq;
|
||||
const float penalty_present = params.penalty_present;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
@@ -97,7 +128,7 @@ llama_token llama_sampling_sample(
|
||||
|
||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
// apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
@@ -116,19 +147,15 @@ llama_token llama_sampling_sample(
|
||||
|
||||
// apply penalties
|
||||
if (!prev.empty()) {
|
||||
const float nl_logit = logits[llama_token_nl(ctx_main)];
|
||||
const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);
|
||||
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
||||
|
||||
llama_sample_repetition_penalty(ctx_main, &cur_p,
|
||||
prev.data() + prev.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
|
||||
prev.data() + prev.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
||||
prev.data() + prev.size() - penalty_last_n,
|
||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
|
||||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
@@ -140,8 +167,12 @@ llama_token llama_sampling_sample(
|
||||
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
if (temp < 0.0) {
|
||||
// greedy sampling, with probs
|
||||
llama_sample_softmax(ctx_main, &cur_p);
|
||||
id = cur_p.data[0].id;
|
||||
} else if (temp == 0.0) {
|
||||
// greedy sampling, no probs
|
||||
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
@@ -152,8 +183,9 @@ llama_token llama_sampling_sample(
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
// temperature sampling
|
||||
size_t min_keep = std::max(1, params.n_probs);
|
||||
|
||||
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
|
||||
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
|
||||
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
|
||||
@@ -183,11 +215,12 @@ llama_token llama_sampling_sample(
|
||||
void llama_sampling_accept(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
llama_token id) {
|
||||
llama_token id,
|
||||
bool apply_grammar) {
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(id);
|
||||
|
||||
if (ctx_sampling->grammar != NULL) {
|
||||
if (ctx_sampling->grammar != NULL && apply_grammar) {
|
||||
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,30 +10,30 @@
|
||||
|
||||
// sampling parameters
|
||||
typedef struct llama_sampling_params {
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||
float presence_penalty = 0.00f; // 0.0 = disabled
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
// Classifier-Free Guidance
|
||||
// https://arxiv.org/abs/2306.17806
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // how strong is guidance
|
||||
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
|
||||
} llama_sampling_params;
|
||||
|
||||
// general sampler context
|
||||
@@ -58,7 +58,7 @@ struct llama_sampling_context {
|
||||
#include "common.h"
|
||||
|
||||
// Create a new sampling context instance.
|
||||
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);
|
||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
|
||||
|
||||
void llama_sampling_free(struct llama_sampling_context * ctx);
|
||||
|
||||
@@ -70,6 +70,15 @@ void llama_sampling_reset(llama_sampling_context * ctx);
|
||||
// Copy the sampler context
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
||||
|
||||
// Get the last sampled token
|
||||
llama_token llama_sampling_last(llama_sampling_context * ctx);
|
||||
|
||||
// Get a string representation of the last sampled tokens
|
||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
|
||||
|
||||
// Print sampling parameters into a string
|
||||
std::string llama_sampling_print(const llama_sampling_params & params);
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||
@@ -96,4 +105,5 @@ llama_token llama_sampling_sample(
|
||||
void llama_sampling_accept(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
llama_token id);
|
||||
llama_token id,
|
||||
bool apply_grammar);
|
||||
|
||||
@@ -236,8 +236,8 @@ int64_t get_example_targets_batch(
|
||||
int64_t used_samples = 0;
|
||||
|
||||
ggml_set_f32(target_probs, 0.0f);
|
||||
llama_token bos = llama_token_bos(lctx);
|
||||
llama_token eos = llama_token_eos(lctx);
|
||||
llama_token bos = llama_token_bos(llama_get_model(lctx));
|
||||
llama_token eos = llama_token_eos(llama_get_model(lctx));
|
||||
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
|
||||
for (int k=0; k<n_batch; ++k) {
|
||||
// printf("%s: batch %d\n", __func__, k);
|
||||
@@ -924,7 +924,7 @@ size_t tokenize_file(
|
||||
for (llama_token token=0; token < n_vocab; ++token) {
|
||||
max_token_text_size = std::max(
|
||||
max_token_text_size,
|
||||
strlen(llama_token_get_text(lctx, token)));
|
||||
strlen(llama_token_get_text(llama_get_model(lctx), token)));
|
||||
}
|
||||
|
||||
// upper bound of context byte length.
|
||||
@@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
|
||||
|
||||
int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
|
||||
if (impr_plot > 0) impr_plot = 0;
|
||||
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
|
||||
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
|
||||
printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
|
||||
__func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
|
||||
*sched, opt->loss_after);
|
||||
|
||||
@@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
@@ -86,6 +87,11 @@ if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
endianess = gguf.GGUFEndian.LITTLE
|
||||
if args.bigendian:
|
||||
endianess = gguf.GGUFEndian.BIG
|
||||
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
|
||||
print(f"gguf: Conversion Endianess {endianess}")
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
@@ -104,7 +110,7 @@ print("gguf: loading model "+dir_model.name)
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
print("hello print: ",hparams["architectures"][0])
|
||||
if hparams["architectures"][0] != "BaichuanForCausalLM":
|
||||
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
@@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
|
||||
num_parts = count_model_parts(dir_model)
|
||||
print(f"num_parts:{num_parts}\n")
|
||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
@@ -224,7 +230,7 @@ gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -118,18 +118,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name)
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "FalconForCausalLM":
|
||||
if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
@@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
block_count = hparams.get("num_hidden_layers")
|
||||
if block_count is None:
|
||||
block_count = hparams["n_layer"] # old name
|
||||
|
||||
n_head = hparams.get("num_attention_heads")
|
||||
if n_head is None:
|
||||
n_head = hparams["n_head"] # old name
|
||||
|
||||
n_head_kv = hparams.get("num_kv_heads")
|
||||
if n_head_kv is None:
|
||||
n_head_kv = hparams.get("n_head_kv", 1) # old name
|
||||
|
||||
gguf_writer.add_name("Falcon")
|
||||
gguf_writer.add_context_length(2048) # not in config.json
|
||||
@@ -105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
if "num_kv_heads" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
|
||||
else:
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_head_count(n_head)
|
||||
gguf_writer.add_head_count_kv(n_head_kv)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
@@ -145,17 +152,13 @@ gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["num_attention_heads"]
|
||||
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
|
||||
|
||||
head_dim = hparams["hidden_size"] // n_head
|
||||
|
||||
# tensor info
|
||||
|
||||
@@ -123,18 +123,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -388,7 +388,9 @@ def handle_metadata(cfg, hp):
|
||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
||||
cfg.vocabtype )
|
||||
# FIXME: Respect cfg.vocab_dir?
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
||||
load_merges = cfg.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
convert.check_vocab_size(params, vocab)
|
||||
return (params, vocab, svocab)
|
||||
|
||||
|
||||
@@ -128,18 +128,27 @@ vocab_size = hparams["vocab_size"]
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -139,18 +139,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -111,18 +111,26 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
52
convert.py
52
convert.py
@@ -366,16 +366,19 @@ class SentencePieceVocab:
|
||||
added_tokens = {}
|
||||
|
||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
actual_ids = sorted(added_tokens.values())
|
||||
if expected_ids != actual_ids:
|
||||
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
|
||||
|
||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||
self.added_tokens_list = [text for (text, idx) in items]
|
||||
self.vocab_size_base: int = vocab_size
|
||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||
actual_new_ids = sorted(new_tokens.keys())
|
||||
|
||||
if expected_new_ids != actual_new_ids:
|
||||
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||
|
||||
# Token pieces that were added to the base vocabulary.
|
||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||
self.vocab_size_base = vocab_size
|
||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
@@ -803,8 +806,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
|
||||
|
||||
|
||||
class OutputFile:
|
||||
def __init__(self, fname_out: Path) -> None:
|
||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
|
||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
name = "LLaMA"
|
||||
@@ -875,10 +878,10 @@ class OutputFile:
|
||||
self.gguf.close()
|
||||
|
||||
@staticmethod
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out)
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
@@ -903,10 +906,10 @@ class OutputFile:
|
||||
return dt.quantize(arr)
|
||||
|
||||
@staticmethod
|
||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
|
||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out)
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
@@ -1123,8 +1126,9 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
|
||||
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
|
||||
args = parser.parse_args(args_in)
|
||||
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
if args.dump_single:
|
||||
model_plus = lazy_load_file(args.model)
|
||||
do_dump_model(model_plus)
|
||||
@@ -1138,6 +1142,9 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
endianess = gguf.GGUFEndian.LITTLE
|
||||
if args.bigendian:
|
||||
endianess = gguf.GGUFEndian.BIG
|
||||
|
||||
params = Params.load(model_plus)
|
||||
if params.n_ctx == -1:
|
||||
@@ -1159,10 +1166,13 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
|
||||
vocab: Vocab
|
||||
if args.vocab_only:
|
||||
assert args.outfile, "need --outfile if using --vocab-only"
|
||||
if not args.outfile:
|
||||
raise ValueError("need --outfile if using --vocab-only")
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
@@ -1174,7 +1184,9 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
|
||||
load_merges = args.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
@@ -1185,7 +1197,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
|
||||
|
||||
@@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(embd-input)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(beam-search)
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
if (LLAMA_METAL)
|
||||
add_subdirectory(metal)
|
||||
endif()
|
||||
|
||||
@@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
|
||||
@@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
|
||||
@@ -11,12 +11,19 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
|
||||
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
// number of parallel batches
|
||||
int n_parallel = 1;
|
||||
|
||||
// total length of the sequences including the prompt
|
||||
int n_len = 32;
|
||||
|
||||
// number of layers to offload to the GPU
|
||||
int n_gpu_layers = 0;
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
@@ -29,13 +36,18 @@ int main(int argc, char ** argv) {
|
||||
n_parallel = std::atoi(argv[3]);
|
||||
}
|
||||
|
||||
if (argc >= 5) {
|
||||
n_len = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
n_gpu_layers = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (params.prompt.empty()) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
// total length of the sequences including the prompt
|
||||
const int n_len = 32;
|
||||
|
||||
// init LLM
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
@@ -44,7 +56,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
@@ -175,7 +187,7 @@ int main(int argc, char ** argv) {
|
||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
i_batch[i] = -1;
|
||||
LOG_TEE("\n");
|
||||
if (n_parallel > 1) {
|
||||
|
||||
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
|
||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
|
||||
}
|
||||
|
||||
// Function matching type llama_beam_search_callback_fn_t.
|
||||
|
||||
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
|
||||
if (file.size < 4) {
|
||||
return false;
|
||||
}
|
||||
uint32_t magic = file.read_u32();
|
||||
std::string magic = file.read_string(4);
|
||||
return magic == GGUF_MAGIC;
|
||||
}
|
||||
|
||||
|
||||
4
examples/embd-input/.gitignore
vendored
4
examples/embd-input/.gitignore
vendored
@@ -1,4 +0,0 @@
|
||||
PandaGPT
|
||||
MiniGPT-4
|
||||
*.pth
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
set(TARGET embdinput)
|
||||
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
set(TARGET embd-input-test)
|
||||
add_executable(${TARGET} embd-input-test.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
@@ -1,63 +0,0 @@
|
||||
### Examples for input embedding directly
|
||||
|
||||
## Requirement
|
||||
build `libembdinput.so`
|
||||
run the following comman in main dir (../../).
|
||||
```
|
||||
make
|
||||
```
|
||||
|
||||
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
|
||||
|
||||
1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
|
||||
2. Convert it to ggml format.
|
||||
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
|
||||
|
||||
```
|
||||
import torch
|
||||
|
||||
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
|
||||
pth_path = "./examples/embd-input/llava_projection.pth"
|
||||
|
||||
dic = torch.load(bin_path)
|
||||
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
|
||||
torch.save({k: dic[k] for k in used_key}, pth_path)
|
||||
```
|
||||
4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
|
||||
|
||||
|
||||
## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
|
||||
|
||||
1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
|
||||
The `adapter_config.json` is
|
||||
```
|
||||
{
|
||||
"peft_type": "LORA",
|
||||
"fan_in_fan_out": false,
|
||||
"bias": null,
|
||||
"modules_to_save": null,
|
||||
"r": 32,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.1,
|
||||
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
|
||||
}
|
||||
```
|
||||
2. Papare the `vicuna` v0 model.
|
||||
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
|
||||
4. Clone the PandaGPT source.
|
||||
```
|
||||
git clone https://github.com/yxuansu/PandaGPT
|
||||
```
|
||||
5. Install the requirement of PandaGPT.
|
||||
6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
|
||||
|
||||
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
|
||||
|
||||
1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
|
||||
2. Clone the MiniGPT-4 source.
|
||||
```
|
||||
git clone https://github.com/Vision-CAIR/MiniGPT-4/
|
||||
```
|
||||
3. Install the requirement of PandaGPT.
|
||||
4. Papare the `vicuna` v0 model.
|
||||
5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.
|
||||
@@ -1,221 +0,0 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "embd-input.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
|
||||
extern "C" {
|
||||
|
||||
struct MyModel* create_mymodel(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = uint32_t(time(NULL));
|
||||
}
|
||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
struct MyModel * ret = new MyModel();
|
||||
ret->ctx = ctx;
|
||||
ret->params = params;
|
||||
ret->n_past = 0;
|
||||
// printf("ctx: %d\n", ret->ctx);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void free_mymodel(struct MyModel * mymodel) {
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
delete mymodel;
|
||||
}
|
||||
|
||||
|
||||
bool eval_float(void * model, float * input, int N){
|
||||
MyModel * mymodel = (MyModel*)model;
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
int n_emb = llama_n_embd(llama_get_model(ctx));
|
||||
int n_past = mymodel->n_past;
|
||||
int n_batch = N; // params.n_batch;
|
||||
|
||||
for (int i = 0; i < (int) N; i += n_batch) {
|
||||
int n_eval = (int) N - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
mymodel->n_past = n_past;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
|
||||
MyModel * mymodel = (MyModel* )model;
|
||||
llama_context * ctx;
|
||||
ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
int n_past = mymodel->n_past;
|
||||
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
mymodel->n_past = n_past;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool eval_id(struct MyModel* mymodel, int id) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(mymodel, tokens);
|
||||
}
|
||||
|
||||
bool eval_string(struct MyModel * mymodel,const char* str){
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
|
||||
eval_tokens(mymodel, embd_inp);
|
||||
return true;
|
||||
}
|
||||
|
||||
llama_token sampling_id(struct MyModel* mymodel) {
|
||||
llama_context* ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
// int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
// const float repeat_penalty = params.repeat_penalty;
|
||||
// const float alpha_presence = params.presence_penalty;
|
||||
// const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
const char * sampling(struct MyModel * mymodel) {
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
int id = sampling_id(mymodel);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(ctx)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_piece(ctx, id);
|
||||
}
|
||||
eval_id(mymodel, id);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,35 +0,0 @@
|
||||
#include "embd-input.h"
|
||||
#include <stdlib.h>
|
||||
#include <random>
|
||||
#include <string.h>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
auto mymodel = create_mymodel(argc, argv);
|
||||
int N = 10;
|
||||
int max_tgt_len = 500;
|
||||
int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
|
||||
|
||||
// add random float embd to test evaluation
|
||||
float * data = new float[N*n_embd];
|
||||
std::default_random_engine e;
|
||||
std::uniform_real_distribution<float> u(0,1);
|
||||
for (int i=0;i<N*n_embd;i++) {
|
||||
data[i] = u(e);
|
||||
}
|
||||
|
||||
eval_string(mymodel, "user: what is the color of the flag of UN?");
|
||||
eval_float(mymodel, data, N);
|
||||
eval_string(mymodel, "assistant:");
|
||||
eval_string(mymodel, mymodel->params.prompt.c_str());
|
||||
const char* tmp;
|
||||
for (int i=0; i<max_tgt_len; i++) {
|
||||
tmp = sampling(mymodel);
|
||||
if (strcmp(tmp, "</s>")==0) break;
|
||||
printf("%s", tmp);
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
free_mymodel(mymodel);
|
||||
return 0;
|
||||
}
|
||||
@@ -1,27 +0,0 @@
|
||||
#ifndef _EMBD_INPUT_H_
|
||||
#define _EMBD_INPUT_H_ 1
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
typedef struct MyModel {
|
||||
llama_context* ctx;
|
||||
gpt_params params;
|
||||
int n_past = 0;
|
||||
} MyModel;
|
||||
|
||||
struct MyModel* create_mymodel(int argc, char ** argv);
|
||||
|
||||
bool eval_float(void* model, float* input, int N);
|
||||
bool eval_tokens(void* model, std::vector<llama_token> tokens);
|
||||
bool eval_id(struct MyModel* mymodel, int id);
|
||||
bool eval_string(struct MyModel* mymodel, const char* str);
|
||||
const char * sampling(struct MyModel* mymodel);
|
||||
llama_token sampling_id(struct MyModel* mymodel);
|
||||
void free_mymodel(struct MyModel* mymodel);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,72 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import ctypes
|
||||
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
libc = cdll.LoadLibrary("./libembdinput.so")
|
||||
libc.sampling.restype=c_char_p
|
||||
libc.create_mymodel.restype=c_void_p
|
||||
libc.eval_string.argtypes=[c_void_p, c_char_p]
|
||||
libc.sampling.argtypes=[c_void_p]
|
||||
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
|
||||
|
||||
|
||||
class MyModel:
|
||||
def __init__(self, args):
|
||||
argc = len(args)
|
||||
c_str = [c_char_p(i.encode()) for i in args]
|
||||
args_c = (c_char_p * argc)(*c_str)
|
||||
self.model = c_void_p(libc.create_mymodel(argc, args_c))
|
||||
self.max_tgt_len = 512
|
||||
self.print_string_eval = True
|
||||
|
||||
def __del__(self):
|
||||
libc.free_mymodel(self.model)
|
||||
|
||||
def eval_float(self, x):
|
||||
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
|
||||
|
||||
def eval_string(self, x):
|
||||
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
|
||||
if self.print_string_eval:
|
||||
print(x)
|
||||
|
||||
def eval_token(self, x):
|
||||
libc.eval_id(self.model, x)
|
||||
|
||||
def sampling(self):
|
||||
s = libc.sampling(self.model)
|
||||
return s
|
||||
|
||||
def stream_generate(self, end="</s>"):
|
||||
ret = b""
|
||||
end = end.encode()
|
||||
for _ in range(self.max_tgt_len):
|
||||
tmp = self.sampling()
|
||||
ret += tmp
|
||||
yield tmp
|
||||
if ret.endswith(end):
|
||||
break
|
||||
|
||||
def generate_with_print(self, end="</s>"):
|
||||
ret = b""
|
||||
for i in self.stream_generate(end=end):
|
||||
ret += i
|
||||
print(i.decode(errors="replace"), end="", flush=True)
|
||||
print("")
|
||||
return ret.decode(errors="replace")
|
||||
|
||||
|
||||
def generate(self, end="</s>"):
|
||||
text = b"".join(self.stream_generate(end=end))
|
||||
return text.decode(errors="replace")
|
||||
|
||||
if __name__ == "__main__":
|
||||
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
|
||||
model.eval_string("""user: what is the color of the flag of UN?""")
|
||||
x = np.random.random((5120,10))# , dtype=np.float32)
|
||||
model.eval_float(x)
|
||||
model.eval_string("""assistant:""")
|
||||
for i in model.generate():
|
||||
print(i.decode(errors="replace"), end="", flush=True)
|
||||
@@ -1,71 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from embd_input import MyModel
|
||||
import numpy as np
|
||||
from torch import nn
|
||||
import torch
|
||||
from transformers import CLIPVisionModel, CLIPImageProcessor
|
||||
from PIL import Image
|
||||
|
||||
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
|
||||
vision_tower = "openai/clip-vit-large-patch14"
|
||||
select_hidden_state_layer = -2
|
||||
# (vision_config.image_size // vision_config.patch_size) ** 2
|
||||
image_token_len = (224//14)**2
|
||||
|
||||
class Llava:
|
||||
def __init__(self, args):
|
||||
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
|
||||
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
|
||||
self.mm_projector = nn.Linear(1024, 5120)
|
||||
self.model = MyModel(["main", *args])
|
||||
|
||||
def load_projection(self, path):
|
||||
state = torch.load(path)
|
||||
self.mm_projector.load_state_dict({
|
||||
"weight": state["model.mm_projector.weight"],
|
||||
"bias": state["model.mm_projector.bias"]})
|
||||
|
||||
def chat(self, question):
|
||||
self.model.eval_string("user: ")
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\nassistant: ")
|
||||
return self.model.generate_with_print()
|
||||
|
||||
def chat_with_image(self, image, question):
|
||||
with torch.no_grad():
|
||||
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
|
||||
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
|
||||
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
|
||||
image_feature = select_hidden_state[:, 1:]
|
||||
embd_image = self.mm_projector(image_feature)
|
||||
embd_image = embd_image.cpu().numpy()[0]
|
||||
self.model.eval_string("user: ")
|
||||
self.model.eval_token(32003-2) # im_start
|
||||
self.model.eval_float(embd_image.T)
|
||||
for i in range(image_token_len-embd_image.shape[0]):
|
||||
self.model.eval_token(32003-3) # im_patch
|
||||
self.model.eval_token(32003-1) # im_end
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\nassistant: ")
|
||||
return self.model.generate_with_print()
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
# model form liuhaotian/LLaVA-13b-delta-v1-1
|
||||
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
|
||||
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
|
||||
# Also here can use pytorch_model-00003-of-00003.bin directly.
|
||||
a.load_projection(os.path.join(
|
||||
os.path.dirname(__file__) ,
|
||||
"llava_projection.pth"))
|
||||
respose = a.chat_with_image(
|
||||
Image.open("./media/llama1-logo.png").convert('RGB'),
|
||||
"what is the text in the picture?")
|
||||
respose
|
||||
a.chat("what is the color of it?")
|
||||
|
||||
|
||||
|
||||
@@ -1,129 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from embd_input import MyModel
|
||||
import numpy as np
|
||||
from torch import nn
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
|
||||
sys.path.insert(0, minigpt4_path)
|
||||
from minigpt4.models.blip2 import Blip2Base
|
||||
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
|
||||
|
||||
|
||||
class MiniGPT4(Blip2Base):
|
||||
"""
|
||||
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
|
||||
"""
|
||||
def __init__(self,
|
||||
args,
|
||||
vit_model="eva_clip_g",
|
||||
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
|
||||
img_size=224,
|
||||
drop_path_rate=0,
|
||||
use_grad_checkpoint=False,
|
||||
vit_precision="fp32",
|
||||
freeze_vit=True,
|
||||
freeze_qformer=True,
|
||||
num_query_token=32,
|
||||
llama_model="",
|
||||
prompt_path="",
|
||||
prompt_template="",
|
||||
max_txt_len=32,
|
||||
end_sym='\n',
|
||||
low_resource=False, # use 8 bit and put vit in cpu
|
||||
device_8bit=0
|
||||
):
|
||||
super().__init__()
|
||||
self.img_size = img_size
|
||||
self.low_resource = low_resource
|
||||
self.preprocessor = Blip2ImageEvalProcessor(img_size)
|
||||
|
||||
print('Loading VIT')
|
||||
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
|
||||
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
|
||||
)
|
||||
print('Loading VIT Done')
|
||||
print('Loading Q-Former')
|
||||
self.Qformer, self.query_tokens = self.init_Qformer(
|
||||
num_query_token, self.visual_encoder.num_features
|
||||
)
|
||||
self.Qformer.cls = None
|
||||
self.Qformer.bert.embeddings.word_embeddings = None
|
||||
self.Qformer.bert.embeddings.position_embeddings = None
|
||||
for layer in self.Qformer.bert.encoder.layer:
|
||||
layer.output = None
|
||||
layer.intermediate = None
|
||||
self.load_from_pretrained(url_or_filename=q_former_model)
|
||||
print('Loading Q-Former Done')
|
||||
self.llama_proj = nn.Linear(
|
||||
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
|
||||
)
|
||||
self.max_txt_len = max_txt_len
|
||||
self.end_sym = end_sym
|
||||
self.model = MyModel(["main", *args])
|
||||
# system prompt
|
||||
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
|
||||
"You will be able to see the image once I provide it to you. Please answer my questions."
|
||||
"###")
|
||||
|
||||
def encode_img(self, image):
|
||||
image = self.preprocessor(image)
|
||||
image = image.unsqueeze(0)
|
||||
device = image.device
|
||||
if self.low_resource:
|
||||
self.vit_to_cpu()
|
||||
image = image.to("cpu")
|
||||
|
||||
with self.maybe_autocast():
|
||||
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
|
||||
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
|
||||
|
||||
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
|
||||
query_output = self.Qformer.bert(
|
||||
query_embeds=query_tokens,
|
||||
encoder_hidden_states=image_embeds,
|
||||
encoder_attention_mask=image_atts,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
inputs_llama = self.llama_proj(query_output.last_hidden_state)
|
||||
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
|
||||
return inputs_llama
|
||||
|
||||
def load_projection(self, path):
|
||||
state = torch.load(path)["model"]
|
||||
self.llama_proj.load_state_dict({
|
||||
"weight": state["llama_proj.weight"],
|
||||
"bias": state["llama_proj.bias"]})
|
||||
|
||||
def chat(self, question):
|
||||
self.model.eval_string("Human: ")
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\n### Assistant:")
|
||||
return self.model.generate_with_print(end="###")
|
||||
|
||||
def chat_with_image(self, image, question):
|
||||
with torch.no_grad():
|
||||
embd_image = self.encode_img(image)
|
||||
embd_image = embd_image.cpu().numpy()[0]
|
||||
self.model.eval_string("Human: <Img>")
|
||||
self.model.eval_float(embd_image.T)
|
||||
self.model.eval_string("</Img> ")
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\n### Assistant:")
|
||||
return self.model.generate_with_print(end="###")
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
|
||||
a.load_projection(os.path.join(
|
||||
os.path.dirname(__file__) ,
|
||||
"pretrained_minigpt4.pth"))
|
||||
respose = a.chat_with_image(
|
||||
Image.open("./media/llama1-logo.png").convert('RGB'),
|
||||
"what is the text in the picture?")
|
||||
a.chat("what is the color of it?")
|
||||
@@ -1,99 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from embd_input import MyModel
|
||||
import numpy as np
|
||||
from torch import nn
|
||||
import torch
|
||||
|
||||
# use PandaGPT path
|
||||
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
|
||||
imagebind_ckpt_path = "./models/panda_gpt/"
|
||||
|
||||
sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
|
||||
from ImageBind.models import imagebind_model
|
||||
from ImageBind import data
|
||||
|
||||
ModalityType = imagebind_model.ModalityType
|
||||
max_tgt_len = 400
|
||||
|
||||
class PandaGPT:
|
||||
def __init__(self, args):
|
||||
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
|
||||
self.visual_encoder.eval()
|
||||
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
|
||||
self.max_tgt_len = max_tgt_len
|
||||
self.model = MyModel(["main", *args])
|
||||
self.generated_text = ""
|
||||
self.device = "cpu"
|
||||
|
||||
def load_projection(self, path):
|
||||
state = torch.load(path, map_location="cpu")
|
||||
self.llama_proj.load_state_dict({
|
||||
"weight": state["llama_proj.weight"],
|
||||
"bias": state["llama_proj.bias"]})
|
||||
|
||||
def eval_inputs(self, inputs):
|
||||
self.model.eval_string("<Img>")
|
||||
embds = self.extract_multimoal_feature(inputs)
|
||||
for i in embds:
|
||||
self.model.eval_float(i.T)
|
||||
self.model.eval_string("</Img> ")
|
||||
|
||||
def chat(self, question):
|
||||
return self.chat_with_image(None, question)
|
||||
|
||||
def chat_with_image(self, inputs, question):
|
||||
if self.generated_text == "":
|
||||
self.model.eval_string("###")
|
||||
self.model.eval_string(" Human: ")
|
||||
if inputs:
|
||||
self.eval_inputs(inputs)
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\n### Assistant:")
|
||||
ret = self.model.generate_with_print(end="###")
|
||||
self.generated_text += ret
|
||||
return ret
|
||||
|
||||
def extract_multimoal_feature(self, inputs):
|
||||
features = []
|
||||
for key in ["image", "audio", "video", "thermal"]:
|
||||
if key + "_paths" in inputs:
|
||||
embeds = self.encode_data(key, inputs[key+"_paths"])
|
||||
features.append(embeds)
|
||||
return features
|
||||
|
||||
def encode_data(self, data_type, data_paths):
|
||||
|
||||
type_map = {
|
||||
"image": ModalityType.VISION,
|
||||
"audio": ModalityType.AUDIO,
|
||||
"video": ModalityType.VISION,
|
||||
"thermal": ModalityType.THERMAL,
|
||||
}
|
||||
load_map = {
|
||||
"image": data.load_and_transform_vision_data,
|
||||
"audio": data.load_and_transform_audio_data,
|
||||
"video": data.load_and_transform_video_data,
|
||||
"thermal": data.load_and_transform_thermal_data
|
||||
}
|
||||
|
||||
load_function = load_map[data_type]
|
||||
key = type_map[data_type]
|
||||
|
||||
inputs = {key: load_function(data_paths, self.device)}
|
||||
with torch.no_grad():
|
||||
embeddings = self.visual_encoder(inputs)
|
||||
embeds = embeddings[key]
|
||||
embeds = self.llama_proj(embeds).cpu().numpy()
|
||||
return embeds
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
|
||||
a.load_projection("./models/panda_gpt/adapter_model.bin")
|
||||
a.chat_with_image(
|
||||
{"image_paths": ["./media/llama1-logo.png"]},
|
||||
"what is the text in the picture? 'llama' or 'lambda'?")
|
||||
a.chat("what is the color of it?")
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -39,8 +39,8 @@ static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
@@ -104,7 +104,7 @@ static void sigint_handler(int signo) {
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
@@ -246,14 +246,14 @@ int main(int argc, char ** argv) {
|
||||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(ctx));
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
|
||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
||||
@@ -261,7 +261,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
@@ -358,36 +358,10 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
struct llama_grammar * grammar = NULL;
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
LOG_TEE("%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
LOG_TEE("\n");
|
||||
|
||||
{
|
||||
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
||||
if (params.infill) {
|
||||
printf("\n************\n");
|
||||
@@ -430,7 +404,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
@@ -549,7 +523,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
@@ -567,8 +541,11 @@ int main(int argc, char ** argv) {
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
@@ -600,10 +577,10 @@ int main(int argc, char ** argv) {
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// deal with eot token in infill mode
|
||||
if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||
if(is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n");
|
||||
@@ -617,7 +594,7 @@ int main(int argc, char ** argv) {
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
@@ -627,7 +604,7 @@ int main(int argc, char ** argv) {
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
@@ -640,7 +617,7 @@ int main(int argc, char ** argv) {
|
||||
process_escapes(params.input_suffix);
|
||||
}
|
||||
suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
@@ -650,14 +627,14 @@ int main(int argc, char ** argv) {
|
||||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(ctx));
|
||||
embd_inp.push_back(llama_token_middle(model));
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
n_remain = params.n_predict;
|
||||
@@ -667,7 +644,7 @@ int main(int argc, char ** argv) {
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of text token in interactive mode
|
||||
else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
@@ -684,7 +661,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
@@ -740,22 +717,14 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
// reset grammar state if we're restarting generation
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
llama_sampling_reset(ctx_sampling);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -767,7 +736,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
@@ -778,9 +747,7 @@ int main(int argc, char ** argv) {
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
||||
@@ -933,7 +933,7 @@ struct sql_printer : public printer {
|
||||
};
|
||||
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
|
||||
int n_processed = 0;
|
||||
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
@@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
||||
}
|
||||
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_token token = llama_token_bos(ctx);
|
||||
llama_token token = llama_token_bos(llama_get_model(ctx));
|
||||
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
@@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
test t(inst, lmodel, ctx);
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// warmup run
|
||||
if (t.n_prompt > 0) {
|
||||
@@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
if (t.n_prompt > 0) {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
set(TARGET clip)
|
||||
add_library(${TARGET} clip.cpp clip.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if (NOT MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
|
||||
@@ -112,8 +112,7 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
|
||||
static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
||||
if (!cur) {
|
||||
printf("unable to find tensor %s\n", name.c_str());
|
||||
throw std::runtime_error(format("unable to find tensor %s\n", name.c_str()));
|
||||
throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
|
||||
}
|
||||
|
||||
return cur;
|
||||
@@ -136,7 +135,7 @@ static std::string get_ftype(int ftype) {
|
||||
case 8:
|
||||
return "q8_0";
|
||||
default:
|
||||
throw std::runtime_error(format("Unrecognized file type: %d\n", ftype));
|
||||
throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -462,6 +461,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
};
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname, params);
|
||||
if (!ctx) {
|
||||
throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
|
||||
}
|
||||
|
||||
if (verbosity >= 1) {
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
@@ -608,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
|
||||
new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
|
||||
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
|
||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
|
||||
@@ -16,13 +16,29 @@ checkpoint = torch.load(path)
|
||||
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
|
||||
|
||||
# store these tensors in a new dictionary and torch.save them
|
||||
projector = {name: checkpoint[name] for name in mm_tensors}
|
||||
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
||||
torch.save(projector, f"{args.model}/llava.projector")
|
||||
|
||||
# remove these tensors from the checkpoint and save it again
|
||||
for name in mm_tensors:
|
||||
del checkpoint[name]
|
||||
|
||||
# BakLLaVA models contain CLIP tensors in it
|
||||
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
|
||||
if len(clip_tensors) > 0:
|
||||
clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
|
||||
torch.save(clip, f"{args.model}/llava.clip")
|
||||
|
||||
# remove these tensors
|
||||
for name in clip_tensors:
|
||||
del checkpoint[name]
|
||||
|
||||
# added tokens should be removed to be able to convert Mistral models
|
||||
if os.path.exists(f"{args.model}/added_tokens.json"):
|
||||
with open(f"{args.model}/added_tokens.json", "w") as f:
|
||||
f.write("{}\n")
|
||||
|
||||
|
||||
torch.save(checkpoint, path)
|
||||
|
||||
print("Done!")
|
||||
|
||||
@@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
||||
|
||||
// TODO: use common/sampling.h
|
||||
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
// out of user input, sample next token
|
||||
const float temp = params.sampling_params.temp;
|
||||
const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
|
||||
const float top_p = params.sampling_params.top_p;
|
||||
const float tfs_z = params.sampling_params.tfs_z;
|
||||
const float typical_p = params.sampling_params.typical_p;
|
||||
// const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
|
||||
// const float repeat_penalty = params.sampling_params.repeat_penalty;
|
||||
// const float alpha_presence = params.sampling_params.presence_penalty;
|
||||
// const float alpha_frequency = params.sampling_params.frequency_penalty;
|
||||
const int mirostat = params.sampling_params.mirostat;
|
||||
const float mirostat_tau = params.sampling_params.mirostat_tau;
|
||||
const float mirostat_eta = params.sampling_params.mirostat_eta;
|
||||
// const bool penalize_nl = params.sampling_params.penalize_nl;
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
|
||||
// const float repeat_penalty = sparams.repeat_penalty;
|
||||
// const float alpha_presence = sparams.presence_penalty;
|
||||
// const float alpha_frequency = sparams.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = sparams.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx_llama);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
@@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
@@ -135,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
|
||||
int id = sample_id(ctx_llama, params);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(ctx_llama)) {
|
||||
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_piece(ctx_llama, id);
|
||||
|
||||
@@ -16,6 +16,8 @@ add_library(common OBJECT
|
||||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
|
||||
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
@@ -248,7 +248,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
@@ -298,7 +298,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// remove any "future" tokens that we might have inherited from the previous session
|
||||
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
@@ -415,8 +415,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
@@ -459,7 +458,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
@@ -612,7 +611,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
@@ -631,12 +630,9 @@ int main(int argc, char ** argv) {
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
|
||||
// Most likely will remove this in the future to avoid exposing "prev"
|
||||
// Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
|
||||
// penalty will be applied only based on the tokens generated by the model.
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
@@ -667,12 +663,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// check for reverse prompt
|
||||
// check for reverse prompt in the last n_prev tokens
|
||||
if (!params.antiprompt.empty()) {
|
||||
std::string last_output;
|
||||
for (auto id : ctx_sampling->prev) {
|
||||
last_output += llama_token_to_piece(ctx, id);
|
||||
}
|
||||
const int n_prev = 32;
|
||||
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
@@ -699,7 +693,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||
if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
@@ -726,7 +720,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
@@ -767,6 +761,9 @@ int main(int argc, char ** argv) {
|
||||
n_consumed = embd_inp.size();
|
||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||
}
|
||||
if (params.escape) {
|
||||
process_escapes(buffer);
|
||||
}
|
||||
|
||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
|
||||
@@ -807,7 +804,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
|
||||
LOG_TEE(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
|
||||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.ctx_sampling = llama_sampling_init(params);
|
||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
@@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
|
||||
|
||||
llama_sampling_accept(client.ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
|
||||
|
||||
if (client.n_decoded == 1) {
|
||||
// start measuring generation time after the first token to make sure all concurrent clients
|
||||
@@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
|
||||
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
|
||||
|
||||
if (client.n_decoded > 2 &&
|
||||
(id == llama_token_eos(ctx) ||
|
||||
(id == llama_token_eos(model) ||
|
||||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
|
||||
client.response.find("User:") != std::string::npos ||
|
||||
client.response.find('\n') != std::string::npos)) {
|
||||
|
||||
@@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||
}
|
||||
|
||||
const auto batch_logits = llama_get_logits(ctx);
|
||||
@@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
|
||||
// add BOS token for the first batch of each chunk
|
||||
if (add_bos && j == 0) {
|
||||
tokens[batch_start] = llama_token_bos(ctx);
|
||||
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||
@@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
|
||||
if (logits.empty()) {
|
||||
|
||||
@@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
@@ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
|
||||
#endif
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
@@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
}
|
||||
|
||||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
[[noreturn]]
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
if (it.name != "COPY") {
|
||||
@@ -103,6 +102,8 @@ int main(int argc, char ** argv) {
|
||||
params.quantize_output_tensor = false;
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
params.allow_requantize = true;
|
||||
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
|
||||
params.pure = true;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
||||
@@ -24,6 +24,10 @@ Command line options:
|
||||
- `--port`: Set the port to listen. Default: `8080`.
|
||||
- `--path`: path from which to serve static files (default examples/server/public)
|
||||
- `--embedding`: Enable embedding extraction, Default: disabled.
|
||||
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
|
||||
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
|
||||
## Build
|
||||
|
||||
@@ -158,6 +162,8 @@ node index.js
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
*Result JSON:*
|
||||
|
||||
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
@@ -188,6 +194,12 @@ node index.js
|
||||
|
||||
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
|
||||
|
||||
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
|
||||
|
||||
`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
|
||||
|
||||
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
- **POST** `/tokenize`: Tokenize a given text.
|
||||
|
||||
*Options:*
|
||||
@@ -218,8 +230,32 @@ node index.js
|
||||
|
||||
It also accepts all the options of `/completion` except `stream` and `prompt`.
|
||||
|
||||
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
||||
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
|
||||
|
||||
`prompt`: Specify a context that you want all connecting clients to respect.
|
||||
|
||||
`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
|
||||
|
||||
`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
|
||||
|
||||
```json
|
||||
{
|
||||
"system_prompt": {
|
||||
"prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
|
||||
"anti_prompt": "User:",
|
||||
"assistant_name": "Assistant:"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
|
||||
|
||||
### Interactive mode
|
||||
|
||||
Check the sample in [chat.mjs](chat.mjs).
|
||||
|
||||
@@ -8,6 +8,7 @@ import json
|
||||
|
||||
|
||||
app = Flask(__name__)
|
||||
slot_id = -1
|
||||
|
||||
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
|
||||
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
|
||||
@@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
|
||||
if(is_present(body, "stop")): postData["stop"] += body["stop"]
|
||||
postData["n_keep"] = -1
|
||||
postData["stream"] = stream
|
||||
|
||||
postData["cache_prompt"] = True
|
||||
postData["slot_id"] = slot_id
|
||||
return postData
|
||||
|
||||
def make_resData(data, chat=False, promptToken=[]):
|
||||
@@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
|
||||
}
|
||||
]
|
||||
}
|
||||
slot_id = data["slot_id"]
|
||||
if (chat):
|
||||
if (start):
|
||||
resData["choices"][0]["delta"] = {
|
||||
|
||||
@@ -7,6 +7,11 @@ const args = process.argv.slice(2);
|
||||
const grammarJsonSchemaFile = args.find(
|
||||
(_, index) => args[index - 1] === "--grammar-json-schema"
|
||||
);
|
||||
|
||||
const no_cached_prompt = args.find(
|
||||
(_, index) => args[index - 1] === "--no-cache-prompt"
|
||||
) ?? "false";
|
||||
|
||||
const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
|
||||
|
||||
// Example usage: function,arguments
|
||||
@@ -30,6 +35,9 @@ if (grammarFile) {
|
||||
grammar = readFileSync(grammarFile, 'utf-8')
|
||||
}
|
||||
|
||||
// for cached prompt
|
||||
let slot_id = -1;
|
||||
|
||||
const API_URL = 'http://127.0.0.1:8080'
|
||||
|
||||
const chat = [
|
||||
@@ -76,6 +84,8 @@ async function chat_completion(question) {
|
||||
top_p: 0.9,
|
||||
n_keep: n_keep,
|
||||
n_predict: 256,
|
||||
cache_prompt: no_cached_prompt === "false",
|
||||
slot_id: slot_id,
|
||||
stop: ["\n### Human:"], // stop completion after generating this
|
||||
grammar,
|
||||
stream: true,
|
||||
@@ -92,6 +102,7 @@ async function chat_completion(question) {
|
||||
const t = Buffer.from(chunk).toString('utf8')
|
||||
if (t.startsWith('data: ')) {
|
||||
const message = JSON.parse(t.substring(6))
|
||||
slot_id = message.slot_id
|
||||
answer += message.content
|
||||
process.stdout.write(message.content)
|
||||
if (message.stop) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -125,6 +125,7 @@
|
||||
background-color: #222;
|
||||
color: #ddd;
|
||||
}
|
||||
|
||||
code {
|
||||
font-family: monospace;
|
||||
padding: 0.1em 0.3em;
|
||||
@@ -141,7 +142,8 @@
|
||||
display: inline;
|
||||
}
|
||||
|
||||
header, footer {
|
||||
header,
|
||||
footer {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
@@ -163,6 +165,7 @@
|
||||
0% {
|
||||
background-position: 0%;
|
||||
}
|
||||
|
||||
100% {
|
||||
background-position: 100%;
|
||||
}
|
||||
@@ -181,6 +184,7 @@
|
||||
--loading-color-1: #22222200;
|
||||
--loading-color-2: #222222ff;
|
||||
}
|
||||
|
||||
.popover-content {
|
||||
background-color: black;
|
||||
}
|
||||
@@ -194,6 +198,8 @@
|
||||
|
||||
import { llama } from '/completion.js';
|
||||
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
|
||||
let selected_image = false;
|
||||
var slot_id = -1;
|
||||
|
||||
const session = signal({
|
||||
prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
|
||||
@@ -203,6 +209,7 @@
|
||||
type: "chat", // "chat" | "completion"
|
||||
char: "Llama",
|
||||
user: "User",
|
||||
image_selected: ''
|
||||
})
|
||||
|
||||
const params = signal({
|
||||
@@ -220,7 +227,9 @@
|
||||
mirostat_tau: 5, // target entropy
|
||||
mirostat_eta: 0.1, // learning rate
|
||||
grammar: '',
|
||||
n_probs: 0, // no completion_probabilities
|
||||
n_probs: 0, // no completion_probabilities,
|
||||
image_data: [],
|
||||
cache_prompt: true
|
||||
})
|
||||
|
||||
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
|
||||
@@ -270,6 +279,7 @@
|
||||
// saved templates were successfuly imported.
|
||||
|
||||
console.log('Processing saved templates and updating default template')
|
||||
params.value = { ...params.value, image_data: [] };
|
||||
|
||||
//console.log(importedTemplates);
|
||||
savedUserTemplates.value = importedTemplates;
|
||||
@@ -294,7 +304,9 @@
|
||||
|
||||
function userTemplateApply(t) {
|
||||
session.value = t.data.session;
|
||||
session.value = { ...session.value, image_selected: '' };
|
||||
params.value = t.data.params;
|
||||
params.value = { ...params.value, image_data: [] };
|
||||
}
|
||||
|
||||
function userTemplateResetToDefaultAndApply() {
|
||||
@@ -385,20 +397,25 @@
|
||||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
||||
const data = chunk.data;
|
||||
|
||||
if (data.stop) {
|
||||
while (
|
||||
currentMessages.length > 0 &&
|
||||
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||
) {
|
||||
) {
|
||||
currentMessages.pop();
|
||||
}
|
||||
transcriptUpdate([...history, [char, currentMessages]])
|
||||
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||
} else {
|
||||
currentMessages.push(data);
|
||||
slot_id = data.slot_id;
|
||||
if (selected_image && !data.multimodal) {
|
||||
alert("The server was not compiled for multimodal or the model projector can't be loaded.");
|
||||
return;
|
||||
}
|
||||
transcriptUpdate([...history, [char, currentMessages]])
|
||||
}
|
||||
|
||||
@@ -419,7 +436,7 @@
|
||||
|
||||
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
||||
|
||||
const prompt = template(session.value.template, {
|
||||
let prompt = template(session.value.template, {
|
||||
message: msg,
|
||||
history: session.value.transcript.flatMap(
|
||||
([name, data]) =>
|
||||
@@ -434,9 +451,12 @@
|
||||
)
|
||||
).join("\n"),
|
||||
});
|
||||
|
||||
if (selected_image) {
|
||||
prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
|
||||
}
|
||||
await runLlama(prompt, {
|
||||
...params.value,
|
||||
slot_id: slot_id,
|
||||
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
||||
}, "{{char}}");
|
||||
}
|
||||
@@ -446,10 +466,11 @@
|
||||
console.log('already running...');
|
||||
return;
|
||||
}
|
||||
const {prompt} = session.value;
|
||||
const { prompt } = session.value;
|
||||
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
||||
await runLlama(prompt, {
|
||||
...params.value,
|
||||
slot_id: slot_id,
|
||||
stop: [],
|
||||
}, "");
|
||||
}
|
||||
@@ -467,6 +488,27 @@
|
||||
transcriptUpdate([]);
|
||||
}
|
||||
|
||||
const uploadImage = (e) => {
|
||||
e.preventDefault();
|
||||
document.getElementById("fileInput").click();
|
||||
document.getElementById("fileInput").addEventListener("change", function (event) {
|
||||
const selectedFile = event.target.files[0];
|
||||
if (selectedFile) {
|
||||
const reader = new FileReader();
|
||||
reader.onload = function () {
|
||||
const image_data = reader.result;
|
||||
session.value = { ...session.value, image_selected: image_data };
|
||||
params.value = {
|
||||
...params.value, image_data: [
|
||||
{ data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
|
||||
}
|
||||
};
|
||||
selected_image = true;
|
||||
reader.readAsDataURL(selectedFile);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function MessageInput() {
|
||||
const message = useSignal("")
|
||||
|
||||
@@ -497,6 +539,7 @@
|
||||
</div>
|
||||
<div class="right">
|
||||
<button type="submit" disabled=${generating.value}>Send</button>
|
||||
<button onclick=${uploadImage}>Upload Image</button>
|
||||
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||
<button onclick=${reset}>Reset</button>
|
||||
</div>
|
||||
@@ -540,7 +583,7 @@
|
||||
data;
|
||||
message = html`<${Markdownish} text=${template(text)} />`
|
||||
}
|
||||
if(user) {
|
||||
if (user) {
|
||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||
} else {
|
||||
return html`<p key=${index}>${message}</p>`
|
||||
@@ -549,6 +592,7 @@
|
||||
|
||||
return html`
|
||||
<section id="chat" ref=${container}>
|
||||
<img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
|
||||
${messages.flatMap(chatLine)}
|
||||
</section>`;
|
||||
};
|
||||
@@ -567,7 +611,7 @@
|
||||
const converter = new SchemaConverter(
|
||||
grammarJsonSchemaPropOrder.value
|
||||
.split(',')
|
||||
.reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
|
||||
.reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
|
||||
)
|
||||
converter.visit(schema, '')
|
||||
params.value = {
|
||||
@@ -579,7 +623,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
const FloatField = ({label, max, min, name, step, value}) => {
|
||||
const FloatField = ({ label, max, min, name, step, value }) => {
|
||||
return html`
|
||||
<div>
|
||||
<label for="${name}">${label}</label>
|
||||
@@ -589,7 +633,7 @@
|
||||
`
|
||||
};
|
||||
|
||||
const IntField = ({label, max, min, name, value}) => {
|
||||
const IntField = ({ label, max, min, name, value }) => {
|
||||
return html`
|
||||
<div>
|
||||
<label for="${name}">${label}</label>
|
||||
@@ -672,7 +716,7 @@
|
||||
${GrammarControl()}
|
||||
</fieldset>
|
||||
`
|
||||
);
|
||||
);
|
||||
|
||||
const CompletionConfigForm = () => (
|
||||
html`
|
||||
@@ -694,20 +738,20 @@
|
||||
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
|
||||
|
||||
<fieldset class="two">
|
||||
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
||||
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
||||
${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
|
||||
${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
|
||||
${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
|
||||
${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
|
||||
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
|
||||
${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
|
||||
</fieldset>
|
||||
<details>
|
||||
<summary>More options</summary>
|
||||
<fieldset class="two">
|
||||
${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
|
||||
${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
|
||||
${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
|
||||
${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
|
||||
${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
|
||||
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
||||
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
|
||||
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
||||
</fieldset>
|
||||
<hr />
|
||||
<fieldset class="three">
|
||||
@@ -716,11 +760,11 @@
|
||||
<label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
|
||||
<label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
|
||||
</div>
|
||||
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
|
||||
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
|
||||
${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
|
||||
${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
|
||||
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
|
||||
</fieldset>
|
||||
</details>
|
||||
</form>
|
||||
@@ -759,20 +803,20 @@
|
||||
const popoverChildren = html`
|
||||
<div class="prob-set">
|
||||
${probs.map((p, index) => {
|
||||
return html`
|
||||
return html`
|
||||
<div
|
||||
key=${index}
|
||||
title=${`prob: ${p.prob}`}
|
||||
style=${{
|
||||
padding: '0.3em',
|
||||
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
|
||||
}}
|
||||
padding: '0.3em',
|
||||
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
|
||||
}}
|
||||
>
|
||||
<span>${p.tok_str}: </span>
|
||||
<span>${Math.floor(p.prob * 100)}%</span>
|
||||
</div>
|
||||
`
|
||||
})}
|
||||
})}
|
||||
</div>
|
||||
`
|
||||
|
||||
@@ -851,9 +895,9 @@
|
||||
ref=${popoverRef}
|
||||
class="popover-content"
|
||||
style=${{
|
||||
top: position.value.top,
|
||||
left: position.value.left,
|
||||
}}
|
||||
top: position.value.top,
|
||||
left: position.value.left,
|
||||
}}
|
||||
>
|
||||
${props.popoverChildren}
|
||||
</div>
|
||||
@@ -952,8 +996,11 @@
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="container"></div>
|
||||
<div id="container">
|
||||
<input type="file" id="fileInput" accept="image/*" style="display: none;">
|
||||
</div>
|
||||
<div id="portal"></div>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
|
||||
llama_batch batch = llama_batch_init(512, 0, 1);
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = tokens_list.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
batch.token[i] = tokens_list[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
for (size_t i = 0; i < tokens_list.size(); i++) {
|
||||
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
@@ -138,7 +133,7 @@ int main(int argc, char ** argv) {
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream?
|
||||
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
break;
|
||||
@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
|
||||
fflush(stdout);
|
||||
|
||||
// prepare the next batch
|
||||
batch.n_tokens = 0;
|
||||
llama_batch_clear(batch);
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token [batch.n_tokens] = new_token_id;
|
||||
batch.pos [batch.n_tokens] = n_cur;
|
||||
batch.seq_id[batch.n_tokens] = 0;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
batch.n_tokens += 1;
|
||||
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
|
||||
struct seq_draft {
|
||||
bool active = false;
|
||||
bool drafting = false;
|
||||
@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
{
|
||||
const int n_vocab_tgt = llama_n_vocab(model_tgt);
|
||||
const int n_vocab_dft = llama_n_vocab(model_dft);
|
||||
const int vocab_diff = n_vocab_tgt > n_vocab_dft
|
||||
? n_vocab_tgt - n_vocab_dft
|
||||
: n_vocab_dft - n_vocab_tgt;
|
||||
|
||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
||||
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
|
||||
llama_token_to_piece(ctx_tgt, i).c_str(),
|
||||
llama_token_to_piece(ctx_dft, i).c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
||||
@@ -112,16 +142,16 @@ int main(int argc, char ** argv) {
|
||||
bool has_eos = false;
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp);
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params);
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||
@@ -154,7 +184,7 @@ int main(int argc, char ** argv) {
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
|
||||
@@ -163,7 +193,7 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
if (id == llama_token_eos(ctx_tgt)) {
|
||||
if (id == llama_token_eos(model_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode (ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
@@ -328,7 +359,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int s = sa[is];
|
||||
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id);
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
|
||||
@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
|
||||
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
++n_past_tgt;
|
||||
}
|
||||
|
||||
6
flake.lock
generated
6
flake.lock
generated
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1692913444,
|
||||
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
|
||||
"lastModified": 1698134075,
|
||||
"narHash": "sha256-foCD+nuKzfh49bIoiCBur4+Fx1nozo+4C/6k8BYk4sg=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
|
||||
"rev": "8efd5d1e283604f75a808a20e6cde0ef313d07d4",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -51,6 +51,9 @@
|
||||
};
|
||||
llama-python =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
@@ -126,5 +129,9 @@
|
||||
buildInputs = [ llama-python ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
devShells.extra = pkgs.mkShell {
|
||||
buildInputs = [ llama-python-extra ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
472
ggml-cuda.cu
472
ggml-cuda.cu
@@ -29,6 +29,8 @@
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasGemmBatchedEx hipblasGemmBatchedEx
|
||||
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
@@ -85,6 +87,24 @@
|
||||
#define CC_OFFSET_AMD 1000000
|
||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||
|
||||
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
||||
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
||||
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
||||
// - 7B quantum model: +100-200 MB
|
||||
// - 13B quantum model: +200-400 MB
|
||||
//
|
||||
//#define GGML_CUDA_FORCE_MMQ
|
||||
|
||||
// TODO: improve this to be correct for more hardware
|
||||
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
|
||||
// probably other such cases, and not sure what happens on AMD hardware
|
||||
#if !defined(GGML_CUDA_FORCE_MMQ)
|
||||
#define CUDA_USE_TENSOR_CORES
|
||||
#endif
|
||||
|
||||
// max batch size to use MMQ kernels when tensor cores are available
|
||||
#define MMQ_MAX_BATCH_SIZE 32
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
@@ -468,7 +488,6 @@ static int g_device_count = -1;
|
||||
static int g_main_device = 0;
|
||||
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||
static bool g_mul_mat_q = true;
|
||||
|
||||
static void * g_scratch_buffer = nullptr;
|
||||
static size_t g_scratch_size = 0; // disabled by default
|
||||
@@ -3552,9 +3571,15 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
#define MMQ_X_Q4_0_RDNA1 64
|
||||
#define MMQ_Y_Q4_0_RDNA1 64
|
||||
#define NWARPS_Q4_0_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q4_0_AMPERE 4
|
||||
#define MMQ_Y_Q4_0_AMPERE 32
|
||||
#define NWARPS_Q4_0_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q4_0_AMPERE 64
|
||||
#define MMQ_Y_Q4_0_AMPERE 128
|
||||
#define NWARPS_Q4_0_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q4_0_PASCAL 64
|
||||
#define MMQ_Y_Q4_0_PASCAL 64
|
||||
#define NWARPS_Q4_0_PASCAL 8
|
||||
@@ -3613,9 +3638,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q4_1_RDNA1 64
|
||||
#define MMQ_Y_Q4_1_RDNA1 64
|
||||
#define NWARPS_Q4_1_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q4_1_AMPERE 4
|
||||
#define MMQ_Y_Q4_1_AMPERE 32
|
||||
#define NWARPS_Q4_1_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q4_1_AMPERE 64
|
||||
#define MMQ_Y_Q4_1_AMPERE 128
|
||||
#define NWARPS_Q4_1_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q4_1_PASCAL 64
|
||||
#define MMQ_Y_Q4_1_PASCAL 64
|
||||
#define NWARPS_Q4_1_PASCAL 8
|
||||
@@ -3676,9 +3707,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q5_0_RDNA1 64
|
||||
#define MMQ_Y_Q5_0_RDNA1 64
|
||||
#define NWARPS_Q5_0_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q5_0_AMPERE 4
|
||||
#define MMQ_Y_Q5_0_AMPERE 32
|
||||
#define NWARPS_Q5_0_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q5_0_AMPERE 128
|
||||
#define MMQ_Y_Q5_0_AMPERE 64
|
||||
#define NWARPS_Q5_0_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q5_0_PASCAL 64
|
||||
#define MMQ_Y_Q5_0_PASCAL 64
|
||||
#define NWARPS_Q5_0_PASCAL 8
|
||||
@@ -3737,9 +3774,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q5_1_RDNA1 64
|
||||
#define MMQ_Y_Q5_1_RDNA1 64
|
||||
#define NWARPS_Q5_1_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q5_1_AMPERE 4
|
||||
#define MMQ_Y_Q5_1_AMPERE 32
|
||||
#define NWARPS_Q5_1_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q5_1_AMPERE 128
|
||||
#define MMQ_Y_Q5_1_AMPERE 64
|
||||
#define NWARPS_Q5_1_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q5_1_PASCAL 64
|
||||
#define MMQ_Y_Q5_1_PASCAL 64
|
||||
#define NWARPS_Q5_1_PASCAL 8
|
||||
@@ -3798,9 +3841,15 @@ mul_mat_q5_1(
|
||||
#define MMQ_X_Q8_0_RDNA1 64
|
||||
#define MMQ_Y_Q8_0_RDNA1 64
|
||||
#define NWARPS_Q8_0_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q8_0_AMPERE 4
|
||||
#define MMQ_Y_Q8_0_AMPERE 32
|
||||
#define NWARPS_Q8_0_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q8_0_AMPERE 128
|
||||
#define MMQ_Y_Q8_0_AMPERE 64
|
||||
#define NWARPS_Q8_0_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q8_0_PASCAL 64
|
||||
#define MMQ_Y_Q8_0_PASCAL 64
|
||||
#define NWARPS_Q8_0_PASCAL 8
|
||||
@@ -3859,9 +3908,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q2_K_RDNA1 128
|
||||
#define MMQ_Y_Q2_K_RDNA1 32
|
||||
#define NWARPS_Q2_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q2_K_AMPERE 4
|
||||
#define MMQ_Y_Q2_K_AMPERE 32
|
||||
#define NWARPS_Q2_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q2_K_AMPERE 64
|
||||
#define MMQ_Y_Q2_K_AMPERE 128
|
||||
#define NWARPS_Q2_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q2_K_PASCAL 64
|
||||
#define MMQ_Y_Q2_K_PASCAL 64
|
||||
#define NWARPS_Q2_K_PASCAL 8
|
||||
@@ -3920,9 +3975,15 @@ mul_mat_q2_K(
|
||||
#define MMQ_X_Q3_K_RDNA1 32
|
||||
#define MMQ_Y_Q3_K_RDNA1 128
|
||||
#define NWARPS_Q3_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q3_K_AMPERE 4
|
||||
#define MMQ_Y_Q3_K_AMPERE 32
|
||||
#define NWARPS_Q3_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q3_K_AMPERE 128
|
||||
#define MMQ_Y_Q3_K_AMPERE 128
|
||||
#define NWARPS_Q3_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q3_K_PASCAL 64
|
||||
#define MMQ_Y_Q3_K_PASCAL 64
|
||||
#define NWARPS_Q3_K_PASCAL 8
|
||||
@@ -3983,9 +4044,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q4_K_RDNA1 32
|
||||
#define MMQ_Y_Q4_K_RDNA1 64
|
||||
#define NWARPS_Q4_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q4_K_AMPERE 4
|
||||
#define MMQ_Y_Q4_K_AMPERE 32
|
||||
#define NWARPS_Q4_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q4_K_AMPERE 64
|
||||
#define MMQ_Y_Q4_K_AMPERE 128
|
||||
#define NWARPS_Q4_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q4_K_PASCAL 64
|
||||
#define MMQ_Y_Q4_K_PASCAL 64
|
||||
#define NWARPS_Q4_K_PASCAL 8
|
||||
@@ -4046,9 +4113,15 @@ template <bool need_check> static __global__ void
|
||||
#define MMQ_X_Q5_K_RDNA1 32
|
||||
#define MMQ_Y_Q5_K_RDNA1 64
|
||||
#define NWARPS_Q5_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q5_K_AMPERE 4
|
||||
#define MMQ_Y_Q5_K_AMPERE 32
|
||||
#define NWARPS_Q5_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q5_K_AMPERE 64
|
||||
#define MMQ_Y_Q5_K_AMPERE 128
|
||||
#define NWARPS_Q5_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q5_K_PASCAL 64
|
||||
#define MMQ_Y_Q5_K_PASCAL 64
|
||||
#define NWARPS_Q5_K_PASCAL 8
|
||||
@@ -4107,9 +4180,15 @@ mul_mat_q5_K(
|
||||
#define MMQ_X_Q6_K_RDNA1 32
|
||||
#define MMQ_Y_Q6_K_RDNA1 64
|
||||
#define NWARPS_Q6_K_RDNA1 8
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
#define MMQ_X_Q6_K_AMPERE 4
|
||||
#define MMQ_Y_Q6_K_AMPERE 32
|
||||
#define NWARPS_Q6_K_AMPERE 4
|
||||
#else
|
||||
#define MMQ_X_Q6_K_AMPERE 64
|
||||
#define MMQ_Y_Q6_K_AMPERE 64
|
||||
#define NWARPS_Q6_K_AMPERE 4
|
||||
#endif
|
||||
#define MMQ_X_Q6_K_PASCAL 64
|
||||
#define MMQ_Y_Q6_K_PASCAL 64
|
||||
#define NWARPS_Q6_K_PASCAL 8
|
||||
@@ -4165,7 +4244,7 @@ template <bool need_check> static __global__ void
|
||||
}
|
||||
|
||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst0, const int ncols, const int nrows) {
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
@@ -4179,7 +4258,9 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
||||
float tmp = 0.0f;
|
||||
|
||||
const block_q_t * x = (const block_q_t *) vx;
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy + blockIdx.x*blocks_per_row;
|
||||
|
||||
float * dst = dst0 + blockIdx.x*nrows;
|
||||
|
||||
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
|
||||
@@ -4203,11 +4284,14 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y0, float * __restrict__ dst0, const int ncols, const int nrows) {
|
||||
// qk = quantized weights per x block
|
||||
// qr = number of quantized weights per data value in x block
|
||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
|
||||
const dfloat * y = y0 + blockIdx.x*ncols;
|
||||
float * dst = dst0 + blockIdx.x*nrows;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
}
|
||||
@@ -4326,13 +4410,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int channel_x = channel / channel_x_divisor;
|
||||
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_dst = nrows_x;
|
||||
const int row_dst = row_x;
|
||||
const int row_dst = row_x;
|
||||
|
||||
const int idst = channel*nrows_dst + row_dst;
|
||||
|
||||
@@ -4345,13 +4429,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
break;
|
||||
}
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
const int row_y = col_x;
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const int iy = channel*nrows_y + row_y;
|
||||
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
tmp += xi * y[iy];
|
||||
}
|
||||
|
||||
@@ -4734,178 +4818,178 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
||||
#endif
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const dim3 block_dims(32, 1, 1);
|
||||
dequantize_mul_mat_vec_q5_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
|
||||
}
|
||||
|
||||
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||
const int block_num_y = (nrows + ny - 1) / ny;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(32, ny, 1);
|
||||
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
@@ -4921,10 +5005,10 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
|
||||
dequantize_block<1, 1, convert_f32><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||
}
|
||||
|
||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nb, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(1, block_num_y, 1);
|
||||
const dim3 block_nums(nb, block_num_y, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
dequantize_mul_mat_vec<1, 1, convert_f16>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||
@@ -5661,11 +5745,21 @@ void ggml_init_cublas() {
|
||||
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
||||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
#if defined(GGML_CUDA_FORCE_MMQ)
|
||||
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
||||
#endif
|
||||
#if defined(CUDA_USE_TENSOR_CORES)
|
||||
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
||||
#endif
|
||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||
fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
|
||||
g_tensor_split[id] = total_vram;
|
||||
total_vram += prop.totalGlobalMem;
|
||||
@@ -5675,15 +5769,15 @@ void ggml_init_cublas() {
|
||||
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
g_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
// create cuda streams
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
@@ -6123,38 +6217,39 @@ inline void ggml_cuda_op_mul_mat_vec_q(
|
||||
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -6174,6 +6269,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
||||
@@ -6197,37 +6293,37 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
|
||||
dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
|
||||
convert_mul_mat_vec_f16_cuda (src0_dd_i, src1_dfloat, dst_dd_i, ne00, ne11, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -6252,16 +6348,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
||||
|
||||
GGML_ASSERT(src0_dd_i != nullptr);
|
||||
GGML_ASSERT(src0_dd_i != nullptr);
|
||||
GGML_ASSERT(src1_ddf_i != nullptr);
|
||||
GGML_ASSERT(dst_dd_i != nullptr);
|
||||
|
||||
GGML_ASSERT(dst_dd_i != nullptr);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
int id;
|
||||
@@ -6346,7 +6441,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
||||
cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
row_diff, src1_ncols, ne10,
|
||||
&alpha, src0_ddf_i, ne00,
|
||||
src1_ddf_i, ne10,
|
||||
src1_ddf_i, ne10,
|
||||
&beta, dst_dd_i, ldc));
|
||||
|
||||
if (src0_as != 0) {
|
||||
@@ -7013,7 +7108,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
||||
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
@@ -7023,11 +7119,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
@@ -7046,27 +7142,201 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
|
||||
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
||||
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
||||
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
||||
|
||||
const int64_t ne1 = ggml_nelements(src1);
|
||||
const int64_t ne = ggml_nelements(dst);
|
||||
|
||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
int id;
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||
half * src0_as_f16 = (half *) src0_ddq;
|
||||
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||
|
||||
// convert src1 to fp16
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
|
||||
size_t src1_as = 0;
|
||||
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
||||
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
||||
|
||||
size_t dst_as = 0;
|
||||
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
||||
|
||||
GGML_ASSERT(ne12 % ne02 == 0);
|
||||
GGML_ASSERT(ne13 % ne03 == 0);
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
|
||||
const half alpha_f16 = 1.0f;
|
||||
const half beta_f16 = 0.0f;
|
||||
|
||||
#if 0
|
||||
// use cublasGemmEx
|
||||
{
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
|
||||
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
// use cublasGemmStridedBatchedEx
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||
&beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||
ne12*ne13,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
} else {
|
||||
// use cublasGemmBatchedEx
|
||||
// TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
|
||||
const int ne23 = ne12*ne13;
|
||||
|
||||
// TODO: avoid this alloc
|
||||
void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
|
||||
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
|
||||
ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
|
||||
ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
|
||||
}
|
||||
}
|
||||
|
||||
// allocate device memory for pointers
|
||||
void ** ptrs_as = nullptr;
|
||||
CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
|
||||
|
||||
// TODO: this does not work for some reason -- not sure why?
|
||||
//size_t ptrs_s = 0;
|
||||
//ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
|
||||
|
||||
// copy pointers to device
|
||||
CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
|
||||
|
||||
free(ptrs);
|
||||
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
&beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
|
||||
ne23,
|
||||
CUBLAS_COMPUTE_16F,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
||||
// free device memory for pointers
|
||||
CUDA_CHECK(cudaFree(ptrs_as));
|
||||
//ggml_cuda_pool_free(ptrs_as, ptrs_s);
|
||||
}
|
||||
#endif
|
||||
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
||||
|
||||
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
||||
ggml_cuda_pool_free(dst_f16, dst_as);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
|
||||
const bool all_on_device =
|
||||
(src0->backend == GGML_BACKEND_GPU) &&
|
||||
(src1->backend == GGML_BACKEND_GPU) &&
|
||||
( dst->backend == GGML_BACKEND_GPU);
|
||||
|
||||
int64_t min_compute_capability = INT_MAX;
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
if (min_compute_capability > g_compute_capabilities[id]
|
||||
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
||||
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
||||
min_compute_capability = g_compute_capabilities[id];
|
||||
}
|
||||
}
|
||||
|
||||
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
#ifdef CUDA_USE_TENSOR_CORES
|
||||
const bool use_tensor_cores = true;
|
||||
#else
|
||||
const bool use_tensor_cores = false;
|
||||
#endif
|
||||
|
||||
// debug helpers
|
||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
|
||||
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
|
||||
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
|
||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// KQ single-batch
|
||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
|
||||
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV single-batch
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
||||
|
||||
if ((src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) ||
|
||||
(src1->ne[1] <= MMQ_MAX_BATCH_SIZE && src0->ne[0] % MATRIX_ROW_PADDING == 0)) {
|
||||
#ifdef GGML_CUDA_FORCE_DMMV
|
||||
const bool use_mul_mat_vec_q = false;
|
||||
#else
|
||||
@@ -7079,7 +7349,15 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
||||
}
|
||||
} else {
|
||||
if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
|
||||
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
||||
|
||||
// when tensor cores are available, use them for large batch size
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
||||
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
|
||||
use_mul_mat_q = false;
|
||||
}
|
||||
|
||||
if (use_mul_mat_q) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
||||
} else {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||
@@ -7433,10 +7711,6 @@ void ggml_cuda_set_main_device(const int main_device) {
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
|
||||
g_mul_mat_q = mul_mat_q;
|
||||
}
|
||||
|
||||
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
||||
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
||||
// it still won't always work as expected, but it's better than nothing
|
||||
|
||||
237
ggml-impl.h
Normal file
237
ggml-impl.h
Normal file
@@ -0,0 +1,237 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ((float) (x))
|
||||
#define GGML_FP32_TO_FP16(x) (x)
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // __ARM_NEON
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
||||
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
#endif
|
||||
|
||||
// TODO: backend v2 PR
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
ggml-metal.m
22
ggml-metal.m
@@ -62,6 +62,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul);
|
||||
GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
|
||||
GGML_METAL_DECL_KERNEL(scale);
|
||||
GGML_METAL_DECL_KERNEL(scale_4);
|
||||
GGML_METAL_DECL_KERNEL(silu);
|
||||
GGML_METAL_DECL_KERNEL(relu);
|
||||
GGML_METAL_DECL_KERNEL(gelu);
|
||||
@@ -209,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
if (sourcePath == nil) {
|
||||
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
||||
sourcePath = @"ggml-metal.metal";
|
||||
}
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
|
||||
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
@@ -249,6 +254,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul);
|
||||
GGML_METAL_ADD_KERNEL(mul_row);
|
||||
GGML_METAL_ADD_KERNEL(scale);
|
||||
GGML_METAL_ADD_KERNEL(scale_4);
|
||||
GGML_METAL_ADD_KERNEL(silu);
|
||||
GGML_METAL_ADD_KERNEL(relu);
|
||||
GGML_METAL_ADD_KERNEL(gelu);
|
||||
@@ -347,6 +353,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul);
|
||||
GGML_METAL_DEL_KERNEL(mul_row);
|
||||
GGML_METAL_DEL_KERNEL(scale);
|
||||
GGML_METAL_DEL_KERNEL(scale_4);
|
||||
GGML_METAL_DEL_KERNEL(silu);
|
||||
GGML_METAL_DEL_KERNEL(relu);
|
||||
GGML_METAL_DEL_KERNEL(gelu);
|
||||
@@ -923,15 +930,20 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const float scale = *(const float *) src1->data;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||
int64_t n = ggml_nelements(dst);
|
||||
|
||||
if (n % 4 == 0) {
|
||||
n /= 4;
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale_4];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||
}
|
||||
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||
|
||||
@@ -125,9 +125,17 @@ kernel void kernel_mul_row(
|
||||
}
|
||||
|
||||
kernel void kernel_scale(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
constant float & scale,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * scale;
|
||||
}
|
||||
|
||||
kernel void kernel_scale_4(
|
||||
device const float4 * src0,
|
||||
device float4 * dst,
|
||||
constant float & scale,
|
||||
constant float & scale,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * scale;
|
||||
}
|
||||
|
||||
332
ggml-opencl.cpp
332
ggml-opencl.cpp
@@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
}
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
// copy data to device
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
||||
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy src0 to device
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
||||
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t ev_idx = 0;
|
||||
std::vector<cl_event> events;
|
||||
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (i02 != pi02 || i03 != pi03) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
// TODO: copy and dequantize src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
events.emplace_back();
|
||||
|
||||
// wait for conversion
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, events.data() + ev_idx++);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||
for (auto *event : events) {
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
if (!mul_mat_vec) {
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
}
|
||||
|
||||
ev_idx = 0;
|
||||
events.clear();
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||
} else { // CLBlast matrix matrix multiplication
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
// wait for conversion
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
events.emplace_back();
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, events.data() + ev_idx++);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||
for (auto *event : events) {
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
|
||||
ev_idx = 0;
|
||||
events.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,11 +1,63 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0;
|
||||
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
ggml_fp16_t m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
|
||||
#define QK5_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
||||
} block_q5_0;
|
||||
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
||||
|
||||
#define QK5_1 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
ggml_fp16_t m; // min
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
||||
} block_q5_1;
|
||||
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
||||
|
||||
#define QK8_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
||||
|
||||
#define QK8_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float s; // d * sum(qs[i])
|
||||
int8_t qs[QK8_1]; // quants
|
||||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
||||
// Super-block size
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
@@ -15,18 +67,6 @@
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
||||
// 2-bit quantization
|
||||
// weight is represented as x = a * q + b
|
||||
// 16 blocks of 16 elements each
|
||||
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
|
||||
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
||||
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
|
||||
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
|
||||
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
|
||||
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
|
||||
|
||||
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
|
||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
|
||||
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
|
||||
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
|
||||
|
||||
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
||||
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
|
||||
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
|
||||
|
||||
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
|
||||
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
||||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
// Quantization with histogram collection
|
||||
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
27
ggml.h
27
ggml.h
@@ -231,8 +231,9 @@
|
||||
#define GGML_EXIT_SUCCESS 0
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
||||
#define GGUF_VERSION 2
|
||||
#define GGUF_MAGIC "GGUF"
|
||||
|
||||
#define GGUF_VERSION 3
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
@@ -400,15 +401,16 @@ extern "C" {
|
||||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_2D_STAGE_0, // internal
|
||||
GGML_OP_CONV_2D_STAGE_1, // internal
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
@@ -1019,9 +1021,9 @@ extern "C" {
|
||||
struct ggml_tensor * b,
|
||||
float eps);
|
||||
|
||||
// A: n columns, m rows
|
||||
// B: n columns, p rows (i.e. we transpose it internally)
|
||||
// result is m columns, p rows
|
||||
// A: k columns, n rows => [ne03, ne02, n, k]
|
||||
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
|
||||
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -1928,12 +1930,19 @@ extern "C" {
|
||||
// quantization
|
||||
//
|
||||
|
||||
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
|
||||
//
|
||||
|
||||
@@ -19,9 +19,10 @@ import numpy as np
|
||||
#
|
||||
|
||||
GGUF_MAGIC = 0x46554747
|
||||
GGUF_VERSION = 2
|
||||
GGUF_VERSION = 3
|
||||
GGUF_DEFAULT_ALIGNMENT = 32
|
||||
|
||||
|
||||
# general
|
||||
KEY_GENERAL_ARCHITECTURE = "general.architecture"
|
||||
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
|
||||
@@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
|
||||
Q6_K = 14
|
||||
Q8_K = 15
|
||||
|
||||
class GGUFEndian(IntEnum):
|
||||
LITTLE = 0
|
||||
BIG = 1
|
||||
|
||||
|
||||
class GGUFValueType(IntEnum):
|
||||
UINT8 = 0
|
||||
@@ -644,18 +649,41 @@ class GGUFWriter:
|
||||
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
|
||||
tensors: list[tuple[np.ndarray[Any, Any], int]]
|
||||
|
||||
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
|
||||
@property
|
||||
def pack_prefix(self):
|
||||
if self.endianess==GGUFEndian.LITTLE:
|
||||
return "<"
|
||||
else:
|
||||
return ">"
|
||||
|
||||
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
|
||||
self.fout = open(path, "wb")
|
||||
self.arch = arch
|
||||
self.endianess = endianess
|
||||
self._simple_value_packing = {
|
||||
GGUFValueType.UINT8: f"{self.pack_prefix}B",
|
||||
GGUFValueType.INT8: f"{self.pack_prefix}b",
|
||||
GGUFValueType.UINT16: f"{self.pack_prefix}H",
|
||||
GGUFValueType.INT16: f"{self.pack_prefix}h",
|
||||
GGUFValueType.UINT32: f"{self.pack_prefix}I",
|
||||
GGUFValueType.INT32: f"{self.pack_prefix}i",
|
||||
GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
|
||||
GGUFValueType.UINT64: f"{self.pack_prefix}Q",
|
||||
GGUFValueType.INT64: f"{self.pack_prefix}q",
|
||||
GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
|
||||
GGUFValueType.BOOL: "?" ,
|
||||
}
|
||||
self.add_architecture()
|
||||
self.use_temp_file = use_temp_file
|
||||
self.tensors = []
|
||||
endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
|
||||
print(f"This gguf file is for {endianess_str} only")
|
||||
|
||||
def write_header_to_file(self):
|
||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack("<Q", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<Q", self.kv_data_count))
|
||||
self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
|
||||
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
|
||||
self.flush()
|
||||
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
||||
|
||||
@@ -727,25 +755,12 @@ class GGUFWriter:
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.ARRAY)
|
||||
|
||||
_simple_value_packing = {
|
||||
GGUFValueType.UINT8: "<B",
|
||||
GGUFValueType.INT8: "<b",
|
||||
GGUFValueType.UINT16: "<H",
|
||||
GGUFValueType.INT16: "<h",
|
||||
GGUFValueType.UINT32: "<I",
|
||||
GGUFValueType.INT32: "<i",
|
||||
GGUFValueType.FLOAT32: "<f",
|
||||
GGUFValueType.UINT64: "<Q",
|
||||
GGUFValueType.INT64: "<q",
|
||||
GGUFValueType.FLOAT64: "<d",
|
||||
GGUFValueType.BOOL: "?" ,
|
||||
}
|
||||
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
|
||||
if vtype is None:
|
||||
vtype = GGUFValueType.get_type(val)
|
||||
|
||||
if add_vtype:
|
||||
self.kv_data += struct.pack("<I", vtype)
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
|
||||
self.kv_data_count += 1
|
||||
|
||||
pack_fmt = self._simple_value_packing.get(vtype)
|
||||
@@ -753,14 +768,14 @@ class GGUFWriter:
|
||||
self.kv_data += struct.pack(pack_fmt, val)
|
||||
elif vtype == GGUFValueType.STRING:
|
||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||
self.kv_data += struct.pack("<Q", len(encoded_val))
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
|
||||
self.kv_data += encoded_val
|
||||
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
|
||||
ltype = GGUFValueType.get_type(val[0])
|
||||
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
|
||||
raise ValueError("All items in a GGUF array should be of the same type")
|
||||
self.kv_data += struct.pack("<I", ltype)
|
||||
self.kv_data += struct.pack("<Q", len(val))
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
|
||||
for item in val:
|
||||
self.add_val(item, add_vtype=False)
|
||||
else:
|
||||
@@ -774,22 +789,24 @@ class GGUFWriter:
|
||||
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
self.ti_data += struct.pack("<Q", len(encoded_name))
|
||||
self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
|
||||
self.ti_data += encoded_name
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += struct.pack("<I", n_dims)
|
||||
self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
|
||||
self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
|
||||
if raw_dtype is None:
|
||||
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
self.ti_data += struct.pack("<I", dtype)
|
||||
self.ti_data += struct.pack("<Q", self.offset_tensor)
|
||||
self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
|
||||
self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
|
||||
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
||||
self.ti_data_count += 1
|
||||
|
||||
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
|
||||
if self.endianess == GGUFEndian.BIG:
|
||||
tensor.byteswap(inplace=True)
|
||||
if self.use_temp_file and self.temp_file is None:
|
||||
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
fp.seek(0)
|
||||
@@ -815,6 +832,8 @@ class GGUFWriter:
|
||||
fp.write(bytes([0] * pad))
|
||||
|
||||
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
|
||||
if self.endianess==GGUFEndian.BIG:
|
||||
tensor.byteswap(inplace=True)
|
||||
self.write_padding(self.fout, self.fout.tell())
|
||||
tensor.tofile(self.fout)
|
||||
self.write_padding(self.fout, tensor.nbytes)
|
||||
@@ -968,12 +987,15 @@ class SpecialVocab:
|
||||
merges: list[str] = []
|
||||
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
|
||||
special_token_ids: dict[str, int] = {}
|
||||
n_vocab: int | None = None
|
||||
|
||||
def __init__(
|
||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||
special_token_types: tuple[str, ...] | None = None,
|
||||
n_vocab: int | None = None,
|
||||
):
|
||||
self.special_token_ids = {}
|
||||
self.n_vocab = n_vocab
|
||||
self.load_merges = load_merges
|
||||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
@@ -983,6 +1005,16 @@ class SpecialVocab:
|
||||
if not self._try_load_from_tokenizer_json(path):
|
||||
self._try_load_from_config_json(path)
|
||||
|
||||
def _set_special_token(self, typ: str, tid: Any):
|
||||
if not isinstance(tid, int) or tid < 0:
|
||||
return
|
||||
if self.n_vocab is None or tid < self.n_vocab:
|
||||
self.special_token_ids[typ] = tid
|
||||
return
|
||||
print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
|
||||
file = sys.stderr)
|
||||
|
||||
|
||||
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
tokenizer_file = path / 'tokenizer.json'
|
||||
if not tokenizer_file.is_file():
|
||||
@@ -1010,10 +1042,11 @@ class SpecialVocab:
|
||||
tc_content = entry_content
|
||||
else:
|
||||
continue
|
||||
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
|
||||
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
break
|
||||
# We only need the first match here.
|
||||
maybe_token_id = next((
|
||||
atok.get('id') for atok in added_tokens
|
||||
if atok.get('content') == tc_content), None)
|
||||
self._set_special_token(typ, maybe_token_id)
|
||||
return True
|
||||
|
||||
def _try_load_from_config_json(self, path: Path) -> bool:
|
||||
@@ -1023,21 +1056,21 @@ class SpecialVocab:
|
||||
with open(config_file, encoding = 'utf-8') as f:
|
||||
config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
maybe_token_id = config.get(f'{typ}_token_id')
|
||||
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
self._set_special_token(typ, config.get(f'{typ}_token_id'))
|
||||
return True
|
||||
|
||||
def add_to_gguf(self, gw: GGUFWriter) -> None:
|
||||
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
|
||||
if len(self.merges) > 0:
|
||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||
if not quiet:
|
||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||
gw.add_token_merges(self.merges)
|
||||
for typ, tokid in self.special_token_ids.items():
|
||||
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||
if handler is None:
|
||||
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
|
||||
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
|
||||
continue
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
if not quiet:
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
handler(tokid)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.4.4"
|
||||
version = "0.4.5"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
504
llama.cpp
504
llama.cpp
@@ -19,13 +19,11 @@
|
||||
#ifdef GGML_USE_MPI
|
||||
# include "ggml-mpi.h"
|
||||
#endif
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
# ifndef QK_K
|
||||
# ifdef GGML_QKK_64
|
||||
# define QK_K 64
|
||||
# else
|
||||
# define QK_K 256
|
||||
# endif
|
||||
#ifndef QK_K
|
||||
# ifdef GGML_QKK_64
|
||||
# define QK_K 64
|
||||
# else
|
||||
# define QK_K 256
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@@ -975,14 +973,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
|
||||
(void) tensor;
|
||||
}
|
||||
|
||||
static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
||||
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
|
||||
@@ -1018,8 +1017,8 @@ enum e_model {
|
||||
};
|
||||
|
||||
static const size_t kB = 1024;
|
||||
static const size_t MB = kB*kB;
|
||||
static const size_t GB = kB*kB*kB;
|
||||
static const size_t MB = 1024*kB;
|
||||
static const size_t GB = 1024*MB;
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
@@ -1042,21 +1041,21 @@ struct llama_hparams {
|
||||
float f_max_alibi_bias;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
if (this->n_vocab != other.n_vocab) return true;
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
if (this->n_vocab != other.n_vocab) return true;
|
||||
if (this->n_ctx_train != other.n_ctx_train) return true;
|
||||
if (this->n_embd != other.n_embd) return true;
|
||||
if (this->n_head != other.n_head) return true;
|
||||
if (this->n_head_kv != other.n_head_kv) return true;
|
||||
if (this->n_layer != other.n_layer) return true;
|
||||
if (this->n_rot != other.n_rot) return true;
|
||||
if (this->n_ff != other.n_ff) return true;
|
||||
if (this->n_embd != other.n_embd) return true;
|
||||
if (this->n_head != other.n_head) return true;
|
||||
if (this->n_head_kv != other.n_head_kv) return true;
|
||||
if (this->n_layer != other.n_layer) return true;
|
||||
if (this->n_rot != other.n_rot) return true;
|
||||
if (this->n_ff != other.n_ff) return true;
|
||||
|
||||
const float EPSILON = 1e-9;
|
||||
|
||||
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
|
||||
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
|
||||
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
|
||||
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
|
||||
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
|
||||
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
|
||||
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
|
||||
|
||||
return false;
|
||||
@@ -1195,17 +1194,17 @@ struct llama_vocab {
|
||||
id special_sep_id = -1;
|
||||
id special_pad_id = -1;
|
||||
|
||||
id linefeed_id = 13;
|
||||
id linefeed_id = 13;
|
||||
id special_prefix_id = 32007;
|
||||
id special_middle_id = 32009;
|
||||
id special_suffix_id = 32008;
|
||||
id special_eot_id = 32010;
|
||||
id special_eot_id = 32010;
|
||||
|
||||
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
||||
replace_all(token_left, " ", "\u0120");
|
||||
replace_all(token_left, "\n", "\u010A");
|
||||
replace_all(token_right, " ", "\u0120");
|
||||
replace_all(token_right, "\n", "\u010A");
|
||||
GGML_ASSERT(token_left.find(" ") == std::string::npos);
|
||||
GGML_ASSERT(token_left.find("\n") == std::string::npos);
|
||||
GGML_ASSERT(token_right.find(" ") == std::string::npos);
|
||||
GGML_ASSERT(token_right.find("\n") == std::string::npos);
|
||||
|
||||
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
||||
if (it == bpe_ranks.end()) {
|
||||
@@ -1359,10 +1358,7 @@ static bool llama_kv_cache_init(
|
||||
cache.cells.clear();
|
||||
cache.cells.resize(n_ctx);
|
||||
|
||||
// TODO: this should be:
|
||||
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
|
||||
// change it and test that it works
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
|
||||
memset(cache.buf.data, 0, cache.buf.size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
@@ -1470,17 +1466,12 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
|
||||
if (c0 < 0) c0 = 0;
|
||||
if (c1 < 0) c1 = cache.size;
|
||||
|
||||
for (int32_t i = c0; i < c1; ++i) {
|
||||
static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
|
||||
for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
|
||||
cache.cells[i].pos = -1;
|
||||
cache.cells[i].seq_id.clear();
|
||||
}
|
||||
|
||||
// Searching for a free slot can start here since we know it will be empty.
|
||||
cache.head = uint32_t(c0);
|
||||
cache.head = 0;
|
||||
}
|
||||
|
||||
static void llama_kv_cache_seq_rm(
|
||||
@@ -1494,8 +1485,14 @@ static void llama_kv_cache_seq_rm(
|
||||
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
cache.cells[i].seq_id.erase(seq_id);
|
||||
if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
if (seq_id < 0) {
|
||||
cache.cells[i].seq_id.clear();
|
||||
} else if (cache.cells[i].has_seq_id(seq_id)) {
|
||||
cache.cells[i].seq_id.erase(seq_id);
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
if (cache.cells[i].seq_id.empty()) {
|
||||
cache.cells[i].pos = -1;
|
||||
if (new_head == cache.size) new_head = i;
|
||||
@@ -1556,14 +1553,14 @@ static void llama_kv_cache_seq_shift(
|
||||
|
||||
for (uint32_t i = 0; i < cache.size; ++i) {
|
||||
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
|
||||
cache.cells[i].pos += delta;
|
||||
cache.has_shift = true;
|
||||
cache.cells[i].pos += delta;
|
||||
cache.cells[i].delta += delta;
|
||||
|
||||
if (cache.cells[i].pos < 0) {
|
||||
cache.cells[i].pos = -1;
|
||||
cache.cells[i].seq_id.clear();
|
||||
if (new_head == cache.size) new_head = i;
|
||||
} else {
|
||||
cache.has_shift = true;
|
||||
cache.cells[i].delta = delta;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1580,12 +1577,14 @@ static void llama_kv_cache_seq_shift(
|
||||
enum llama_fver {
|
||||
GGUF_FILE_VERSION_V1 = 1,
|
||||
GGUF_FILE_VERSION_V2 = 2,
|
||||
GGUF_FILE_VERSION_V3 = 3,
|
||||
};
|
||||
|
||||
static const char * llama_file_version_name(llama_fver version) {
|
||||
switch (version) {
|
||||
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
||||
case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
|
||||
case GGUF_FILE_VERSION_V2: return "GGUF V2";
|
||||
case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
@@ -2241,15 +2240,35 @@ static void llm_load_vocab(
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
||||
} else {
|
||||
vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
|
||||
const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
|
||||
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
|
||||
vocab.linefeed_id = ids[0];
|
||||
}
|
||||
|
||||
// special tokens
|
||||
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
||||
GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
|
||||
GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
|
||||
GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
|
||||
GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
|
||||
{
|
||||
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
||||
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
||||
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
||||
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
||||
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
||||
};
|
||||
for (const auto & it : special_token_types) {
|
||||
const std::string & key = kv(std::get<0>(it));
|
||||
int32_t & id = std::get<1>(it), old_id = id;
|
||||
|
||||
GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
|
||||
// Must be >= -1 and < vocab size. Since the key is unsigned, -1
|
||||
// can only come from the default value, so there's no point in
|
||||
// validating that.
|
||||
if (size_t(id + 1) > vocab.id_to_token.size()) {
|
||||
LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
|
||||
__func__, key.c_str(), id, old_id);
|
||||
id = old_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// build special tokens cache
|
||||
{
|
||||
@@ -2675,8 +2694,8 @@ static void llm_load_tensors(
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER:
|
||||
{
|
||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
|
||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
@@ -2727,19 +2746,19 @@ static void llm_load_tensors(
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
@@ -4596,6 +4615,8 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
|
||||
const float norm_eps = hparams.f_norm_eps;
|
||||
|
||||
const int n_gpu_layers = model.n_gpu_layers;
|
||||
|
||||
const int32_t n_tokens = batch.n_tokens;
|
||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||
@@ -4640,6 +4661,27 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
}
|
||||
}
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
(void) i_gpu_start;
|
||||
|
||||
// offload functions set the tensor output backend to GPU
|
||||
// tensors are GPU-accelerated if any input or the output has been offloaded
|
||||
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
||||
offload_func_t offload_func_kq = llama_nop;
|
||||
offload_func_t offload_func_v = llama_nop;
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (n_gpu_layers > n_layer) {
|
||||
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
||||
}
|
||||
if (n_gpu_layers > n_layer + 1) {
|
||||
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
||||
}
|
||||
if (n_gpu_layers > n_layer + 2) {
|
||||
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
{
|
||||
// Compute position embeddings.
|
||||
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
@@ -4665,6 +4707,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
ggml_set_name(KQ_mask, "KQ_mask");
|
||||
offload_func_kq(KQ_mask);
|
||||
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
float * data = (float *) KQ_mask->data;
|
||||
@@ -4688,44 +4731,67 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
ggml_set_name(inpL, "inpL");
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
offload_func_t offload_func = llama_nop;
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (il >= i_gpu_start) {
|
||||
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
||||
}
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
{
|
||||
// Norm
|
||||
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||
offload_func(cur);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
||||
offload_func(cur);
|
||||
}
|
||||
|
||||
{
|
||||
// Self Attention
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
offload_func_kq(cur);
|
||||
|
||||
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
||||
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
offload_func_kq(cur);
|
||||
|
||||
struct ggml_tensor * Qcur = tmpq;
|
||||
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
ggml_set_name(tmpq, "tmpq");
|
||||
ggml_set_name(tmpk, "tmpk");
|
||||
ggml_set_name(tmpv, "tmpv");
|
||||
|
||||
offload_func_kq(tmpq);
|
||||
offload_func_kq(tmpk);
|
||||
offload_func_v (tmpv);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
|
||||
struct ggml_tensor * Kcur = tmpk;
|
||||
|
||||
{
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
|
||||
offload_func_v(Vcur);
|
||||
ggml_set_name(Vcur, "Vcur");
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
||||
offload_func_kq(k);
|
||||
ggml_set_name(k, "k");
|
||||
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
||||
( n_ctx)*ggml_element_size(kv_self.v),
|
||||
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
||||
offload_func_v(v);
|
||||
ggml_set_name(v, "v");
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
||||
0, 2, 1, 3);
|
||||
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
||||
offload_func_kq(Q);
|
||||
ggml_set_name(Q, "Q");
|
||||
|
||||
struct ggml_tensor * K =
|
||||
@@ -4734,23 +4800,28 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||
ggml_element_size(kv_self.k)*n_embd_head,
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
||||
offload_func_kq(K);
|
||||
ggml_set_name(K, "K");
|
||||
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
offload_func_kq(KQ);
|
||||
ggml_set_name(KQ, "KQ");
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd_head)
|
||||
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
||||
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||
offload_func_kq(KQ_scaled);
|
||||
ggml_set_name(KQ_scaled, "KQ_scaled");
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
||||
offload_func_kq(KQ_masked);
|
||||
ggml_set_name(KQ_masked, "KQ_masked");
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
offload_func_v(KQ_soft_max);
|
||||
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
||||
|
||||
// split cached V into n_head heads
|
||||
@@ -4763,22 +4834,25 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
ggml_set_name(V, "V");
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
offload_func_v(KQV);
|
||||
ggml_set_name(KQV, "KQV");
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
offload_func_v(KQV_merged);
|
||||
ggml_set_name(KQV_merged, "KQV_merged");
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
||||
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
||||
offload_func_v(cur);
|
||||
ggml_set_name(cur, "KQV_merged_contiguous");
|
||||
}
|
||||
|
||||
// Projection
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
||||
offload_func(cur);
|
||||
|
||||
// Add the input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
offload_func(cur);
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
@@ -4787,27 +4861,36 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
// Norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
||||
offload_func_nr(cur);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
||||
offload_func_nr(cur);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
||||
offload_func(cur);
|
||||
|
||||
// GELU activation
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
offload_func(cur);
|
||||
|
||||
// Projection
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
||||
offload_func(cur);
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
|
||||
}
|
||||
|
||||
// Output Norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||
offload_func_nr(cur);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
||||
ggml_set_name(cur, "result_norm");
|
||||
}
|
||||
ggml_set_name(cur, "result_norm");
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
ggml_set_name(cur, "result_output");
|
||||
@@ -5941,8 +6024,6 @@ static int llama_decode_internal(
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
|
||||
|
||||
// HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
|
||||
if (!lctx.embedding.empty()) {
|
||||
embeddings->backend = GGML_BACKEND_CPU;
|
||||
@@ -5993,11 +6074,20 @@ static int llama_decode_internal(
|
||||
#endif
|
||||
|
||||
// update the kv ring buffer
|
||||
lctx.kv_self.has_shift = false;
|
||||
lctx.kv_self.head += n_tokens;
|
||||
// Ensure kv cache head points to a valid index.
|
||||
if (lctx.kv_self.head >= lctx.kv_self.size) {
|
||||
lctx.kv_self.head = 0;
|
||||
{
|
||||
if (kv_self.has_shift) {
|
||||
kv_self.has_shift = false;
|
||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||
kv_self.cells[i].delta = 0;
|
||||
}
|
||||
}
|
||||
|
||||
kv_self.head += n_tokens;
|
||||
|
||||
// Ensure kv cache head points to a valid index.
|
||||
if (kv_self.head >= kv_self.size) {
|
||||
kv_self.head = 0;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_PERF
|
||||
@@ -6106,11 +6196,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||
}
|
||||
|
||||
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||
static const char * hex = "0123456789ABCDEF";
|
||||
switch (llama_vocab_get_type(vocab)) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: {
|
||||
char buf[7];
|
||||
int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
|
||||
GGML_ASSERT(0 <= result && result < 7);
|
||||
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
||||
return vocab.token_to_id.at(buf);
|
||||
}
|
||||
case LLAMA_VOCAB_TYPE_BPE: {
|
||||
@@ -6324,7 +6413,6 @@ struct llm_tokenizer_bpe {
|
||||
llm_symbol sym;
|
||||
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
||||
sym.text = word.c_str() + offset;
|
||||
sym.n = 1;
|
||||
sym.n = char_len;
|
||||
offset += sym.n;
|
||||
sym.prev = index - 1;
|
||||
@@ -7054,7 +7142,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
||||
std::vector<llama_grammar_candidate> rejects;
|
||||
|
||||
if (stack.empty()) {
|
||||
for (auto tok : candidates) {
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
|
||||
rejects.push_back(tok);
|
||||
}
|
||||
@@ -7065,7 +7153,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
||||
const llama_grammar_element * stack_pos = stack.back();
|
||||
|
||||
std::vector<llama_grammar_candidate> next_candidates;
|
||||
for (auto tok : candidates) {
|
||||
for (const auto & tok : candidates) {
|
||||
if (*tok.code_points == 0) {
|
||||
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
||||
// that cannot satisfy this position in grammar
|
||||
@@ -7091,7 +7179,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
|
||||
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
||||
|
||||
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
||||
for (auto tok : next_rejects) {
|
||||
for (const auto & tok : next_rejects) {
|
||||
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
||||
}
|
||||
|
||||
@@ -7418,37 +7506,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
|
||||
llama_sample_temp(ctx, candidates_p, temp);
|
||||
}
|
||||
|
||||
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
||||
if (last_tokens_size == 0 || penalty == 1.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
||||
if (token_iter == last_tokens + last_tokens_size) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
||||
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
||||
if (candidates->data[i].logit <= 0) {
|
||||
candidates->data[i].logit *= penalty;
|
||||
} else {
|
||||
candidates->data[i].logit /= penalty;
|
||||
}
|
||||
}
|
||||
|
||||
candidates->sorted = false;
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
|
||||
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
|
||||
void llama_sample_repetition_penalties(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present) {
|
||||
if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -7456,19 +7522,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
|
||||
|
||||
// Create a frequency map to count occurrences of each token in last_tokens
|
||||
std::unordered_map<llama_token, int> token_count;
|
||||
for (size_t i = 0; i < last_tokens_size; ++i) {
|
||||
token_count[last_tokens_p[i]]++;
|
||||
for (size_t i = 0; i < penalty_last_n; ++i) {
|
||||
token_count[last_tokens[i]]++;
|
||||
}
|
||||
|
||||
// Apply frequency and presence penalties to the candidates
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
auto token_iter = token_count.find(candidates->data[i].id);
|
||||
const auto token_iter = token_count.find(candidates->data[i].id);
|
||||
if (token_iter == token_count.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int count = token_iter->second;
|
||||
candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
|
||||
const int count = token_iter->second;
|
||||
|
||||
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
||||
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
||||
if (candidates->data[i].logit <= 0) {
|
||||
candidates->data[i].logit *= penalty_repeat;
|
||||
} else {
|
||||
candidates->data[i].logit /= penalty_repeat;
|
||||
}
|
||||
|
||||
candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
|
||||
}
|
||||
|
||||
candidates->sorted = false;
|
||||
@@ -7490,14 +7565,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
||||
}
|
||||
}
|
||||
|
||||
const llama_token eos = llama_token_eos(ctx);
|
||||
const llama_token eos = llama_token_eos(&ctx->model);
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
||||
std::vector<llama_grammar_candidate> candidates_grammar;
|
||||
|
||||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const llama_token id = candidates->data[i].id;
|
||||
const std::string piece = llama_token_to_str(ctx, id);
|
||||
const std::string piece = llama_token_to_piece(ctx, id);
|
||||
if (id == eos) {
|
||||
if (!allow_eos) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
@@ -7700,7 +7775,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
||||
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
if (token == llama_token_eos(ctx)) {
|
||||
if (token == llama_token_eos(&ctx->model)) {
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
@@ -7709,7 +7784,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
const std::string piece = llama_token_to_str(ctx, token);
|
||||
const std::string piece = llama_token_to_piece(ctx, token);
|
||||
|
||||
// Note terminating 0 in decoded string
|
||||
const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
|
||||
@@ -7982,6 +8057,24 @@ struct no_init {
|
||||
no_init() { /* do nothing */ }
|
||||
};
|
||||
|
||||
struct quantize_state_internal {
|
||||
const llama_model & model;
|
||||
const llama_model_quantize_params * params;
|
||||
|
||||
int n_attention_wv = 0;
|
||||
int n_feed_forward_w2 = 0;
|
||||
int i_attention_wv = 0;
|
||||
int i_feed_forward_w2 = 0;
|
||||
|
||||
int n_k_quantized = 0;
|
||||
int n_fallback = 0;
|
||||
|
||||
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
||||
: model(model)
|
||||
, params(params)
|
||||
{}
|
||||
};
|
||||
|
||||
static void llama_convert_tensor_internal(
|
||||
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
||||
const size_t nelements, const int nthread
|
||||
@@ -8040,14 +8133,14 @@ static void llama_convert_tensor_internal(
|
||||
workers.clear();
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
static ggml_type get_k_quant_type(
|
||||
ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
|
||||
int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
|
||||
quantize_state_internal & qs,
|
||||
ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
|
||||
) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
const auto tn = LLM_TN(model.arch);
|
||||
const llm_arch arch = qs.model.arch;
|
||||
const auto tn = LLM_TN(arch);
|
||||
|
||||
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
||||
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
||||
@@ -8055,7 +8148,7 @@ static ggml_type get_k_quant_type(
|
||||
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
int nx = tensor->ne[0];
|
||||
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
||||
new_type = GGML_TYPE_Q8_0;
|
||||
}
|
||||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
@@ -8064,46 +8157,46 @@ static ggml_type get_k_quant_type(
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||
(*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
if (model.type == MODEL_70B) {
|
||||
(qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
if (qs.model.type == MODEL_70B) {
|
||||
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
++*i_attention_wv;
|
||||
++qs.i_attention_wv;
|
||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
||||
: model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
|
||||
new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
||||
: arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
|
||||
: GGML_TYPE_Q3_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
||||
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
||||
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
if (model.arch == LLM_ARCH_FALCON) {
|
||||
new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
|
||||
use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
if (arch == LLM_ARCH_FALCON) {
|
||||
new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
|
||||
use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
} else {
|
||||
if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
++*i_feed_forward_w2;
|
||||
++qs.i_feed_forward_w2;
|
||||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (model.arch != LLM_ARCH_FALCON) {
|
||||
if (arch != LLM_ARCH_FALCON) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
@@ -8130,25 +8223,27 @@ static ggml_type get_k_quant_type(
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
|
||||
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
|
||||
convert_incompatible_tensor = true;
|
||||
} else {
|
||||
++qs.n_k_quantized;
|
||||
}
|
||||
}
|
||||
if (convert_incompatible_tensor) {
|
||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
||||
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
|
||||
} else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
||||
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
|
||||
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
|
||||
} else {
|
||||
throw std::runtime_error("Unsupported tensor size encountered\n");
|
||||
switch (new_type) {
|
||||
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
||||
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
||||
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
||||
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
|
||||
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
|
||||
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
|
||||
}
|
||||
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
|
||||
++qs.n_fallback;
|
||||
}
|
||||
|
||||
return new_type;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type quantized_type;
|
||||
@@ -8163,7 +8258,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
||||
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
||||
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
// K-quants
|
||||
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
||||
@@ -8174,7 +8268,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||
#endif
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
||||
@@ -8201,6 +8295,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
llm_load_arch(ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
|
||||
struct quantize_state_internal qs(model, params);
|
||||
|
||||
if (params->only_copy) {
|
||||
ftype = model.ftype;
|
||||
}
|
||||
@@ -8213,10 +8309,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
||||
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
int n_attention_wv = 0;
|
||||
int n_feed_forward_w2 = 0;
|
||||
|
||||
for (int i = 0; i < ml.n_tensors; ++i) {
|
||||
struct ggml_tensor * meta = ml.get_tensor_meta(i);
|
||||
|
||||
@@ -8224,21 +8316,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
||||
++n_attention_wv;
|
||||
++qs.n_attention_wv;
|
||||
}
|
||||
else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||
++n_feed_forward_w2;
|
||||
++qs.n_feed_forward_w2;
|
||||
}
|
||||
}
|
||||
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
|
||||
if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
|
||||
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
|
||||
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
|
||||
__func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
|
||||
}
|
||||
|
||||
int i_attention_wv = 0;
|
||||
int i_feed_forward_w2 = 0;
|
||||
#endif
|
||||
|
||||
size_t total_size_org = 0;
|
||||
size_t total_size_new = 0;
|
||||
std::vector<int64_t> hist_all(1 << 4, 0);
|
||||
@@ -8302,11 +8390,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
|
||||
if (quantize) {
|
||||
new_type = quantized_type;
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
new_type = get_k_quant_type(
|
||||
new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
|
||||
);
|
||||
#endif
|
||||
if (!params->pure) {
|
||||
new_type = get_k_quant_type(qs, new_type, tensor, ftype);
|
||||
}
|
||||
|
||||
// If we've decided to quantize to the same type the tensor is already
|
||||
// in then there's nothing to do.
|
||||
quantize = tensor->type != new_type;
|
||||
@@ -8431,6 +8518,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
LLAMA_LOG_INFO("\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (qs.n_fallback > 0) {
|
||||
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
|
||||
__func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
|
||||
}
|
||||
}
|
||||
|
||||
static int llama_apply_lora_from_file_internal(
|
||||
@@ -8755,6 +8847,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
/*.allow_requantize =*/ false,
|
||||
/*.quantize_output_tensor =*/ true,
|
||||
/*.only_copy =*/ false,
|
||||
/*.pure =*/ false,
|
||||
};
|
||||
|
||||
return result;
|
||||
@@ -8909,7 +9002,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
// build worst-case graph
|
||||
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
||||
int n_past = cparams.n_ctx - n_tokens;
|
||||
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
@@ -9115,8 +9208,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
||||
return ctx->kv_self.head;
|
||||
}
|
||||
|
||||
void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
|
||||
llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
|
||||
void llama_kv_cache_clear(struct llama_context * ctx) {
|
||||
llama_kv_cache_clear(ctx->kv_self);
|
||||
}
|
||||
|
||||
void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
||||
@@ -9562,7 +9655,7 @@ int llama_eval(
|
||||
llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
int n_past) {
|
||||
llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
|
||||
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
||||
|
||||
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
|
||||
if (ret < 0) {
|
||||
@@ -9577,7 +9670,7 @@ int llama_eval_embd(
|
||||
float * embd,
|
||||
int32_t n_tokens,
|
||||
int n_past) {
|
||||
llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
|
||||
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
||||
|
||||
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
|
||||
@@ -9670,43 +9763,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
|
||||
return ctx->embedding.data();
|
||||
}
|
||||
|
||||
const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
|
||||
return ctx->model.vocab.id_to_token[token].text.c_str();
|
||||
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].text.c_str();
|
||||
}
|
||||
|
||||
float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
|
||||
return ctx->model.vocab.id_to_token[token].score;
|
||||
float llama_token_get_score(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].score;
|
||||
}
|
||||
|
||||
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
|
||||
return ctx->model.vocab.id_to_token[token].type;
|
||||
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
||||
return model->vocab.id_to_token[token].type;
|
||||
}
|
||||
|
||||
llama_token llama_token_bos(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_bos_id;
|
||||
llama_token llama_token_bos(const struct llama_model * model) {
|
||||
return model->vocab.special_bos_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eos(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_eos_id;
|
||||
llama_token llama_token_eos(const struct llama_model * model) {
|
||||
return model->vocab.special_eos_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_nl(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.linefeed_id;
|
||||
}
|
||||
llama_token llama_token_prefix(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_prefix_id;
|
||||
llama_token llama_token_nl(const struct llama_model * model) {
|
||||
return model->vocab.linefeed_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_middle(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_middle_id;
|
||||
llama_token llama_token_prefix(const struct llama_model * model) {
|
||||
return model->vocab.special_prefix_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_suffix(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_suffix_id;
|
||||
llama_token llama_token_middle(const struct llama_model * model) {
|
||||
return model->vocab.special_middle_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eot(const struct llama_context * ctx) {
|
||||
return ctx->model.vocab.special_eot_id;
|
||||
llama_token llama_token_suffix(const struct llama_model * model) {
|
||||
return model->vocab.special_suffix_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eot(const struct llama_model * model) {
|
||||
return model->vocab.special_eot_id;
|
||||
}
|
||||
|
||||
int llama_tokenize(
|
||||
|
||||
56
llama.h
56
llama.h
@@ -178,7 +178,7 @@ extern "C" {
|
||||
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
||||
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
bool embedding; // embedding mode only
|
||||
@@ -191,6 +191,7 @@ extern "C" {
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
|
||||
} llama_model_quantize_params;
|
||||
|
||||
// grammar types
|
||||
@@ -333,17 +334,14 @@ extern "C" {
|
||||
LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
|
||||
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
|
||||
|
||||
// Remove all tokens data of cells in [c0, c1)
|
||||
// c0 < 0 : [0, c1]
|
||||
// c1 < 0 : [c0, inf)
|
||||
LLAMA_API void llama_kv_cache_tokens_rm(
|
||||
struct llama_context * ctx,
|
||||
int32_t c0,
|
||||
int32_t c1);
|
||||
// Clear the KV cache
|
||||
LLAMA_API void llama_kv_cache_clear(
|
||||
struct llama_context * ctx);
|
||||
|
||||
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
// seq_id < 0 : match any sequence
|
||||
// p0 < 0 : [0, p1]
|
||||
// p1 < 0 : [p0, inf)
|
||||
LLAMA_API void llama_kv_cache_seq_rm(
|
||||
struct llama_context * ctx,
|
||||
llama_seq_id seq_id,
|
||||
@@ -494,21 +492,22 @@ extern "C" {
|
||||
// Vocab
|
||||
//
|
||||
|
||||
LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
|
||||
|
||||
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
||||
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||
|
||||
// codellama infill tokens
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
|
||||
LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
|
||||
LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
|
||||
LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
||||
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
|
||||
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
|
||||
|
||||
//
|
||||
// Tokenization
|
||||
@@ -560,21 +559,15 @@ extern "C" {
|
||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
|
||||
|
||||
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
|
||||
LLAMA_API void llama_sample_repetition_penalty(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t last_tokens_size,
|
||||
float penalty);
|
||||
|
||||
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
|
||||
LLAMA_API void llama_sample_frequency_and_presence_penalties(
|
||||
LLAMA_API void llama_sample_repetition_penalties(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
const llama_token * last_tokens,
|
||||
size_t last_tokens_size,
|
||||
float alpha_frequency,
|
||||
float alpha_presence);
|
||||
size_t penalty_last_n,
|
||||
float penalty_repeat,
|
||||
float penalty_freq,
|
||||
float penalty_present);
|
||||
|
||||
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
||||
@@ -663,6 +656,7 @@ extern "C" {
|
||||
float * mu);
|
||||
|
||||
/// @details Selects the token with the highest probability.
|
||||
/// Does not compute the token probabilities. Use llama_sample_softmax() instead.
|
||||
LLAMA_API llama_token llama_sample_token_greedy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates);
|
||||
|
||||
BIN
models/ggml-vocab-baichuan.gguf
Normal file
BIN
models/ggml-vocab-baichuan.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-gpt-neox.gguf
Normal file
BIN
models/ggml-vocab-gpt-neox.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-mpt.gguf
Normal file
BIN
models/ggml-vocab-mpt.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-refact.gguf
Normal file
BIN
models/ggml-vocab-refact.gguf
Normal file
Binary file not shown.
BIN
models/ggml-vocab-starcoder.gguf
Normal file
BIN
models/ggml-vocab-starcoder.gguf
Normal file
Binary file not shown.
@@ -28,9 +28,14 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
|
||||
@@ -4,7 +4,9 @@
|
||||
|
||||
#undef NDEBUG
|
||||
#include <cassert>
|
||||
#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||
|
||||
// deprecated - skip
|
||||
if (qfns.blck_size == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("Testing %s\n", ggml_type_name((ggml_type) i));
|
||||
|
||||
if (qfns.from_float && qfns.to_float) {
|
||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||
const float max_quantization_error =
|
||||
|
||||
@@ -8,11 +8,9 @@
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
static void dump(const llama_token_data_array * candidates) {
|
||||
for (size_t i = 0; i < candidates->size; i++) {
|
||||
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
|
||||
@@ -21,7 +19,6 @@ static void dump(const llama_token_data_array * candidates) {
|
||||
|
||||
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
||||
|
||||
|
||||
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -37,13 +34,12 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
|
||||
llama_sample_top_k(nullptr, &candidates_p, k, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -59,13 +55,12 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
|
||||
llama_sample_top_p(nullptr, &candidates_p, p, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -80,13 +75,12 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
|
||||
llama_sample_tail_free(nullptr, &candidates_p, z, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -101,18 +95,17 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
|
||||
llama_sample_typical(nullptr, &candidates_p, p, 1);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void test_repetition_penalty(
|
||||
static void test_repetition_penalties(
|
||||
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs, float penalty
|
||||
const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
|
||||
) {
|
||||
assert(probs.size() == expected_probs.size());
|
||||
GGML_ASSERT(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -125,41 +118,13 @@ static void test_repetition_penalty(
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
|
||||
llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void test_frequency_presence_penalty(
|
||||
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
||||
const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
|
||||
) {
|
||||
assert(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
// DUMP(&candidates_p);
|
||||
llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
// DUMP(&candidates_p);
|
||||
|
||||
assert(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,13 +146,13 @@ int main(void) {
|
||||
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
|
||||
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
|
||||
|
||||
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
|
||||
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
|
||||
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
|
||||
|
||||
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
|
||||
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
|
||||
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
|
||||
@@ -91,9 +91,19 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: why doesn't this work for the full range of Unicodes?
|
||||
// Restrict to assigned unicode planes
|
||||
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
|
||||
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
if (str != check) {
|
||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||
@@ -103,7 +113,6 @@ int main(int argc, char **argv) {
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_free(ctx);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user