mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
41 Commits
master-751
...
master-b47
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b47b8a9cfe | ||
|
|
b5fe67f8c6 | ||
|
|
24baa54ac1 | ||
|
|
dd6c67d3cb | ||
|
|
5d500e8ccf | ||
|
|
7d5f18468c | ||
|
|
d924522a46 | ||
|
|
4d76a5f49b | ||
|
|
0db14fef06 | ||
|
|
03e566977b | ||
|
|
513f861953 | ||
|
|
3973b25a64 | ||
|
|
ab0e26bdfb | ||
|
|
73643f5fb1 | ||
|
|
a814d04f81 | ||
|
|
4c013bb738 | ||
|
|
42c7c2e2e9 | ||
|
|
78a3d13424 | ||
|
|
ae178ab46b | ||
|
|
54e3bc76fe | ||
|
|
019fe257bb | ||
|
|
e68c96f7fe | ||
|
|
9cf022a188 | ||
|
|
e782c9e735 | ||
|
|
785829dfe8 | ||
|
|
fff0e0eafe | ||
|
|
417a85a001 | ||
|
|
294f424554 | ||
|
|
45a1b07e9b | ||
|
|
b1f4290953 | ||
|
|
d01bccde9f | ||
|
|
6cbf9dfb32 | ||
|
|
7568d1a2b2 | ||
|
|
b7647436cc | ||
|
|
672dda10e4 | ||
|
|
27ab66e437 | ||
|
|
6e7cca4047 | ||
|
|
a6803cab94 | ||
|
|
7dabc66f3c | ||
|
|
7cdd30bf1f | ||
|
|
e8035f141e |
20
.gitignore
vendored
20
.gitignore
vendored
@@ -16,6 +16,8 @@ build/
|
||||
build-em/
|
||||
build-debug/
|
||||
build-release/
|
||||
build-ci-debug/
|
||||
build-ci-release/
|
||||
build-static/
|
||||
build-cublas/
|
||||
build-opencl/
|
||||
@@ -25,9 +27,10 @@ build-no-accel/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
models/*
|
||||
*.bin
|
||||
models-mnt
|
||||
|
||||
/main
|
||||
/quantize
|
||||
@@ -58,3 +61,18 @@ qnt-*.txt
|
||||
perf-*.txt
|
||||
|
||||
examples/jeopardy/results.txt
|
||||
|
||||
|
||||
pyproject.toml
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
||||
# Test binaries
|
||||
tests/test-double-float
|
||||
tests/test-grad0
|
||||
tests/test-opt
|
||||
tests/test-quantize-fns
|
||||
tests/test-quantize-perf
|
||||
tests/test-sampling
|
||||
tests/test-tokenizer-0
|
||||
|
||||
|
||||
@@ -512,6 +512,7 @@ if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
|
||||
target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
install(TARGETS ggml_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
add_library(llama
|
||||
@@ -533,8 +534,32 @@ if (BUILD_SHARED_LIBS)
|
||||
if (LLAMA_METAL)
|
||||
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
endif()
|
||||
install(TARGETS llama LIBRARY)
|
||||
endif()
|
||||
|
||||
include(GNUInstallDirs)
|
||||
install(
|
||||
FILES convert.py
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
OWNER_EXECUTE
|
||||
GROUP_READ
|
||||
GROUP_EXECUTE
|
||||
WORLD_READ
|
||||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
install(
|
||||
FILES convert-lora-to-ggml.py
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
OWNER_EXECUTE
|
||||
GROUP_READ
|
||||
GROUP_EXECUTE
|
||||
WORLD_READ
|
||||
WORLD_EXECUTE
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
|
||||
#
|
||||
# programs, examples and tests
|
||||
|
||||
78
Makefile
78
Makefile
@@ -1,5 +1,8 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
|
||||
@@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
|
||||
# detect Windows
|
||||
ifneq ($(findstring _NT,$(UNAME_S)),)
|
||||
_WIN32 := 1
|
||||
endif
|
||||
|
||||
# library name prefix
|
||||
ifneq ($(_WIN32),1)
|
||||
LIB_PRE := lib
|
||||
endif
|
||||
|
||||
# Dynamic Shared Object extension
|
||||
ifneq ($(_WIN32),1)
|
||||
DSO_EXT := .so
|
||||
else
|
||||
DSO_EXT := .dll
|
||||
endif
|
||||
|
||||
# Windows Sockets 2 (Winsock) for network-capable apps
|
||||
ifeq ($(_WIN32),1)
|
||||
LWINSOCK2 := -lws2_32
|
||||
endif
|
||||
|
||||
ifdef LLAMA_GPROF
|
||||
CFLAGS += -pg
|
||||
CXXFLAGS += -pg
|
||||
@@ -102,7 +127,7 @@ endif
|
||||
# Architecture specific
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
CFLAGS += -march=native -mtune=native
|
||||
CXXFLAGS += -march=native -mtune=native
|
||||
@@ -154,8 +179,8 @@ ifdef LLAMA_MPI
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
|
||||
LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
@@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||
OBJS += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(LLAMA_CUDA_NVCC)
|
||||
else
|
||||
NVCC = nvcc
|
||||
endif #LLAMA_CUDA_NVCC
|
||||
ifdef CUDA_DOCKER_ARCH
|
||||
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
|
||||
else
|
||||
@@ -198,7 +227,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||
else
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif
|
||||
|
||||
ifdef LLAMA_CUDA_CCBIN
|
||||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
endif
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif # LLAMA_CUBLAS
|
||||
@@ -294,7 +325,7 @@ libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
|
||||
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
@@ -325,14 +356,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
|
||||
libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||
|
||||
|
||||
embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
|
||||
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@@ -349,6 +380,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
# Tests
|
||||
#
|
||||
|
||||
tests: $(TEST_TARGETS)
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
./$@
|
||||
@@ -356,6 +389,23 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
.PHONY: tests clean
|
||||
tests:
|
||||
bash ./tests/run-tests.sh
|
||||
tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
@@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t
|
||||
|
||||
```bash
|
||||
# run the verification script
|
||||
python3 .\scripts\verify-checksum-models.py
|
||||
./scripts/verify-checksum-models.py
|
||||
```
|
||||
|
||||
- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
|
||||
|
||||
25
ci/README.md
Normal file
25
ci/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# CI
|
||||
|
||||
In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
|
||||
|
||||
https://github.com/ggml-org/ci
|
||||
|
||||
It monitors the `master` branch for new commits and runs the
|
||||
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
|
||||
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
|
||||
to cover various hardware architectures, including GPU and Apple Silicon instances.
|
||||
|
||||
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
|
||||
Only the branches of this repo are monitored for this keyword.
|
||||
|
||||
It is a good practice, before publishing changes to execute the full CI locally on your machine:
|
||||
|
||||
```bash
|
||||
mkdir tmp
|
||||
|
||||
# CPU-only build
|
||||
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with CUDA support
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
```
|
||||
409
ci/run.sh
Normal file
409
ci/run.sh
Normal file
@@ -0,0 +1,409 @@
|
||||
#/bin/bash
|
||||
#
|
||||
# sample usage:
|
||||
#
|
||||
# mkdir tmp
|
||||
#
|
||||
# # CPU-only build
|
||||
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with CUDA support
|
||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p "$1"
|
||||
mkdir -p "$2"
|
||||
|
||||
OUT=$(realpath "$1")
|
||||
MNT=$(realpath "$2")
|
||||
|
||||
rm -v $OUT/*.log
|
||||
rm -v $OUT/*.exit
|
||||
rm -v $OUT/*.md
|
||||
|
||||
sd=`dirname $0`
|
||||
cd $sd/../
|
||||
SRC=`pwd`
|
||||
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
function gg_wget {
|
||||
local out=$1
|
||||
local url=$2
|
||||
|
||||
local cwd=`pwd`
|
||||
|
||||
mkdir -p $out
|
||||
cd $out
|
||||
|
||||
# should not re-download if file is the same
|
||||
wget -nv -N $url
|
||||
|
||||
cd $cwd
|
||||
}
|
||||
|
||||
function gg_printf {
|
||||
printf -- "$@" >> $OUT/README.md
|
||||
}
|
||||
|
||||
function gg_run {
|
||||
ci=$1
|
||||
|
||||
set -o pipefail
|
||||
set -x
|
||||
|
||||
gg_run_$ci | tee $OUT/$ci.log
|
||||
cur=$?
|
||||
echo "$cur" > $OUT/$ci.exit
|
||||
|
||||
set +x
|
||||
set +o pipefail
|
||||
|
||||
gg_sum_$ci
|
||||
|
||||
ret=$((ret | cur))
|
||||
}
|
||||
|
||||
## ci
|
||||
|
||||
# ctest_debug
|
||||
|
||||
function gg_run_ctest_debug {
|
||||
cd ${SRC}
|
||||
|
||||
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_ctest_debug {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest in debug mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '\n'
|
||||
}
|
||||
|
||||
# ctest_release
|
||||
|
||||
function gg_run_ctest_release {
|
||||
cd ${SRC}
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_ctest_release {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest in release mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_3b_v2
|
||||
|
||||
function gg_run_open_llama_3b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
path_models="../models-mnt/open-llama/3B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.bin"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.bin"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.bin"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.bin"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.bin"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.bin"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.bin"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.bin"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.bin"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.bin"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.bin"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_open_llama_7b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
|
||||
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
|
||||
path_models="../models-mnt/open-llama/7B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.bin"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.bin"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.bin"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.bin"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.bin"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.bin"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.bin"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.bin"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.bin"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.bin"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.bin"
|
||||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_7b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
}
|
||||
|
||||
## main
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
rm -rf ${SRC}/models-mnt
|
||||
|
||||
mnt_models=${MNT}/models
|
||||
mkdir -p ${mnt_models}
|
||||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||
|
||||
python3 -m pip install -r ${SRC}/requirements.txt
|
||||
fi
|
||||
|
||||
ret=0
|
||||
|
||||
test $ret -eq 0 && gg_run ctest_debug
|
||||
test $ret -eq 0 && gg_run ctest_release
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||
else
|
||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
fi
|
||||
fi
|
||||
|
||||
exit $ret
|
||||
1
convert-lora-to-ggml.py
Normal file → Executable file
1
convert-lora-to-ggml.py
Normal file → Executable file
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
1
convert.py
Normal file → Executable file
1
convert.py
Normal file → Executable file
@@ -1,3 +1,4 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import copy
|
||||
|
||||
@@ -2,21 +2,21 @@
|
||||
set -e
|
||||
|
||||
AI_NAME="${AI_NAME:-Miku}"
|
||||
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
|
||||
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
|
||||
USER_NAME="${USER_NAME:-Anon}"
|
||||
|
||||
# Uncomment and adjust to the number of CPU cores you want to use.
|
||||
#N_THREAD="${N_THREAD:-4}"
|
||||
CTX_SIZE="${CTX_SIZE:-4096}"
|
||||
N_PREDICTS="${N_PREDICTS:-4096}"
|
||||
|
||||
GEN_OPTIONS=(--batch_size 1024
|
||||
--ctx_size 2048
|
||||
--ctx_size "$CTX_SIZE"
|
||||
--keep -1
|
||||
--repeat_last_n 256
|
||||
--repeat_penalty 1.17647
|
||||
--temp 0.7
|
||||
--top_k 40
|
||||
--top_p 0.5)
|
||||
--temp 0.6
|
||||
--mirostat 2)
|
||||
|
||||
if [ -n "$N_THREAD" ]; then
|
||||
GEN_OPTIONS+=(--threads "$N_THREAD")
|
||||
@@ -24,16 +24,17 @@ fi
|
||||
|
||||
./main "${GEN_OPTIONS[@]}" \
|
||||
--model "$MODEL" \
|
||||
--in-prefix " " \
|
||||
--in-suffix "${AI_NAME}:" \
|
||||
--n_predict "$N_PREDICTS" \
|
||||
--color --interactive \
|
||||
--reverse-prompt "${USER_NAME}:" \
|
||||
--prompt "
|
||||
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
|
||||
--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
|
||||
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
|
||||
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
|
||||
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
|
||||
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
|
||||
The conversation is only between ${USER_NAME} and ${AI_NAME}
|
||||
The conversation is only between ${USER_NAME} and ${AI_NAME}.
|
||||
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
|
||||
${AI_NAME} can only communicate through text, so she can't send images or videos.
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
set(TARGET baby-llama)
|
||||
add_executable(${TARGET} baby-llama.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET benchmark)
|
||||
add_executable(${TARGET} benchmark-matmult.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--rope-freq-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
} else if (arg == "--rope-freq-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top-p") {
|
||||
@@ -248,12 +260,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.cfg_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--cfg-smooth-factor") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cfg_smooth_factor = std::stof(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -267,6 +273,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_keep = std::stoi(argv[i]);
|
||||
} else if (arg == "--chunks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_chunks = std::stoi(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -375,6 +387,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
} else if (arg == "--perplexity") {
|
||||
params.perplexity = true;
|
||||
} else if (arg == "--perplexity-lines") {
|
||||
params.perplexity_lines = true;
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.logit_bias[llama_token_eos()] = -INFINITY;
|
||||
} else if (arg == "--no-penalize-nl") {
|
||||
@@ -491,16 +505,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --cfg-negative-prompt PROMPT \n");
|
||||
fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n");
|
||||
fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
fprintf(stderr, " --perplexity compute perplexity over each ctx window of the prompt\n");
|
||||
fprintf(stderr, " --perplexity-lines compute perplexity over each line of the prompt\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
if (llama_mlock_supported()) {
|
||||
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
@@ -561,18 +578,20 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
|
||||
lparams.low_vram = params.low_vram;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.embedding = params.embedding;
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
lparams.tensor_split = params.tensor_split;
|
||||
lparams.low_vram = params.low_vram;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.logits_all = params.perplexity;
|
||||
lparams.embedding = params.embedding;
|
||||
lparams.rope_freq_base = params.rope_freq_base;
|
||||
lparams.rope_freq_scale = params.rope_freq_scale;
|
||||
|
||||
return lparams;
|
||||
}
|
||||
|
||||
@@ -28,10 +28,13 @@ struct gpt_params {
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||
|
||||
// sampling parameters
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
@@ -52,7 +55,6 @@ struct gpt_params {
|
||||
// https://arxiv.org/abs/2306.17806
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
|
||||
|
||||
std::string model = "models/7B/ggml-model.bin"; // model path
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
@@ -80,6 +82,7 @@ struct gpt_params {
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
bool perplexity_lines = false; // compute perplexity over each line of the prompt
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool mem_test = false; // compute maximum memory usage
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET embdinput)
|
||||
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
@@ -8,6 +9,7 @@ endif()
|
||||
|
||||
set(TARGET embd-input-test)
|
||||
add_executable(${TARGET} embd-input-test.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -64,7 +64,7 @@ class MiniGPT4(Blip2Base):
|
||||
self.max_txt_len = max_txt_len
|
||||
self.end_sym = end_sym
|
||||
self.model = MyModel(["main", *args])
|
||||
# system promt
|
||||
# system prompt
|
||||
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
|
||||
"You will be able to see the image once I provide it to you. Please answer my questions."
|
||||
"###")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
18
examples/llama2-13b.sh
Executable file
18
examples/llama2-13b.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
||||
18
examples/llama2.sh
Executable file
18
examples/llama2.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
||||
58
examples/llm.vim
Normal file
58
examples/llm.vim
Normal file
@@ -0,0 +1,58 @@
|
||||
function! Llm()
|
||||
|
||||
let url = "http://127.0.0.1:8080/completion"
|
||||
|
||||
" Save the current cursor position
|
||||
let save_cursor = getpos('.')
|
||||
|
||||
silent! %s/\n/\\n/g
|
||||
silent! %s/\t/\\t/g
|
||||
silent! %s/\\n$//
|
||||
|
||||
" Get the content of the current buffer
|
||||
let buffer_content = join(getline(1, '$'), "\n")
|
||||
|
||||
" Replace true newlines with "\n"
|
||||
let buffer_content = substitute(buffer_content, '\n', '\\n', 'g')
|
||||
|
||||
" Trim leading/trailing whitespace
|
||||
let buffer_content = substitute(buffer_content, '^\s\+', '', '')
|
||||
let buffer_content = substitute(buffer_content, '\s\+$', '', '')
|
||||
|
||||
" Create the JSON payload
|
||||
" can't escape backslash, \n gets replaced as \\n
|
||||
let json_payload = '{"prompt":"' . escape(buffer_content, '"/') . '","temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream":false}'
|
||||
|
||||
let prompt_tmpfile = tempname()
|
||||
let response_tmpfile = tempname()
|
||||
call writefile([json_payload], prompt_tmpfile)
|
||||
|
||||
" Define the curl command
|
||||
let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -o ' . shellescape(response_tmpfile) . ' -d @' . shellescape(prompt_tmpfile) . ' ' . url
|
||||
silent execute '!'.curl_command
|
||||
|
||||
let response = join(readfile(response_tmpfile), '')
|
||||
let start_marker = '{"content":"'
|
||||
let end_marker = '","generation_settings'
|
||||
let content_start = stridx(response, start_marker) + len(start_marker)
|
||||
let content_end = stridx(response, end_marker, content_start)
|
||||
|
||||
" Extract the content field from the response
|
||||
let content = strpart(response, content_start, content_end - content_start)
|
||||
|
||||
" Insert the content at the cursor position
|
||||
call setline(line('.'), getline('.') . content)
|
||||
|
||||
" Replace newline "\n" strings with actual newlines in the content
|
||||
silent! %s/\\n/\r/g
|
||||
" and tabs
|
||||
silent! %s/\\t/\t/g
|
||||
" and quote marks for C sources
|
||||
silent! %s/\\"/\"/g
|
||||
|
||||
" Remove the temporary file
|
||||
call delete(prompt_tmpfile)
|
||||
call delete(response_tmpfile)
|
||||
endfunction
|
||||
|
||||
command! Llm call Llm()
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET main)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -84,9 +84,17 @@ int main(int argc, char ** argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 10000.0) {
|
||||
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 1.0) {
|
||||
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
|
||||
" you are on your own\n", __func__, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
@@ -131,17 +139,14 @@ int main(int argc, char ** argv) {
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
|
||||
// determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
|
||||
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||
if (params.mem_test) {
|
||||
{
|
||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
}
|
||||
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
||||
|
||||
{
|
||||
const std::vector<llama_token> tmp = { 0, };
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
|
||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
@@ -549,7 +554,7 @@ int main(int argc, char ** argv) {
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
|
||||
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
// Apply penalties
|
||||
|
||||
92
examples/make-ggml.py
Normal file
92
examples/make-ggml.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
This script converts Hugging Face llama models to GGML and quantizes them.
|
||||
|
||||
Usage:
|
||||
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
|
||||
|
||||
Arguments:
|
||||
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
|
||||
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
|
||||
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
|
||||
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
|
||||
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
|
||||
|
||||
Quant types:
|
||||
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
|
||||
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
|
||||
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
|
||||
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
|
||||
- Q2_K: smallest, extreme quality loss - not recommended
|
||||
- Q3_K: alias for Q3_K_M
|
||||
- Q3_K_S: very small, very high quality loss
|
||||
- Q3_K_M: very small, very high quality loss
|
||||
- Q3_K_L: small, substantial quality loss
|
||||
- Q4_K: alias for Q4_K_M
|
||||
- Q4_K_S: small, significant quality loss
|
||||
- Q4_K_M: medium, balanced quality - recommended
|
||||
- Q5_K: alias for Q5_K_M
|
||||
- Q5_K_S: large, low quality loss - recommended
|
||||
- Q5_K_M: large, very low quality loss - recommended
|
||||
- Q6_K: very large, extremely low quality loss
|
||||
- Q8_0: very large, extremely low quality loss - not recommended
|
||||
- F16: extremely large, virtually no quality loss - not recommended
|
||||
- F32: absolutely huge, lossless - not recommended
|
||||
"""
|
||||
import subprocess
|
||||
subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
def main(model, outname, outdir, quants, keep_fp16):
|
||||
ggml_version = "v3"
|
||||
|
||||
if not os.path.isdir(model):
|
||||
print(f"Model not found at {model}. Downloading...")
|
||||
try:
|
||||
if outname is None:
|
||||
outname = model.split('/')[-1]
|
||||
model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
|
||||
except Exception as e:
|
||||
raise Exception(f"Could not download the model: {e}")
|
||||
|
||||
if outdir is None:
|
||||
outdir = f'../models/{outname}'
|
||||
|
||||
if not os.path.isfile(f"{model}/config.json"):
|
||||
raise Exception(f"Could not find config.json in {model}")
|
||||
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
|
||||
print("Building llama.cpp")
|
||||
subprocess.run(f"cd .. && make quantize", shell=True, check=True)
|
||||
|
||||
fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
|
||||
|
||||
print(f"Making unquantised GGML at {fp16}")
|
||||
if not os.path.isfile(fp16):
|
||||
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
|
||||
else:
|
||||
print(f"Unquantised GGML already exists at: {fp16}")
|
||||
|
||||
print("Making quants")
|
||||
for type in quants:
|
||||
outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
|
||||
print(f"Making {type} : {outfile}")
|
||||
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
|
||||
|
||||
if not keep_fp16:
|
||||
os.remove(fp16)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
|
||||
parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
|
||||
parser.add_argument('--outname', default=None, help='Output model(s) name')
|
||||
parser.add_argument('--outdir', default=None, help='Output directory')
|
||||
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
|
||||
parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
|
||||
@@ -1,3 +1,4 @@
|
||||
set(TEST_TARGET metal)
|
||||
add_executable(${TEST_TARGET} metal.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET perplexity)
|
||||
add_executable(${TARGET} perplexity.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
#include <sstream>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -32,13 +33,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
int count = 0;
|
||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||
|
||||
const int n_chunk = tokens.size() / params.n_ctx;
|
||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
@@ -118,6 +121,77 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
void perplexity_lines(llama_context * ctx, const gpt_params & params) {
|
||||
// Calculates perplexity over each line of the prompt
|
||||
|
||||
std::vector<std::string> prompt_lines;
|
||||
std::istringstream strstream(params.prompt);
|
||||
std::string line;
|
||||
|
||||
while (std::getline(strstream,line,'\n')) {
|
||||
prompt_lines.push_back(line);
|
||||
}
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
int counttotal = 0;
|
||||
size_t n_lines = prompt_lines.size();
|
||||
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
|
||||
|
||||
printf("\nLine\tPPL line\tPPL cumulative\n");
|
||||
|
||||
for (size_t i = 0; i < n_lines; ++i) {
|
||||
|
||||
// Tokenize and insert BOS at start
|
||||
std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
|
||||
|
||||
size_t batch_size = batch_embd.size();
|
||||
|
||||
// Stop if line is too long
|
||||
if( batch_size > (size_t)params.n_ctx ) {
|
||||
fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
|
||||
return;
|
||||
}
|
||||
|
||||
if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
const auto batch_logits = llama_get_logits(ctx);
|
||||
std::vector<float> logits;
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
|
||||
double nllline = 0.0;
|
||||
int countline = 0;
|
||||
|
||||
// Perplexity over second half of the line
|
||||
for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
const std::vector<float> tok_logits(
|
||||
logits.begin() + (j + 0) * n_vocab,
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
|
||||
const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
|
||||
|
||||
nllline += -std::log(prob);
|
||||
++countline;
|
||||
}
|
||||
|
||||
nll += nllline;
|
||||
counttotal += countline;
|
||||
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
@@ -166,7 +240,11 @@ int main(int argc, char ** argv) {
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
perplexity(ctx, params);
|
||||
if (params.perplexity_lines) {
|
||||
perplexity_lines(ctx, params);
|
||||
} else {
|
||||
perplexity(ctx, params);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -14,103 +14,27 @@ struct quant_option {
|
||||
};
|
||||
|
||||
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{
|
||||
"Q4_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0,
|
||||
" 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
|
||||
},
|
||||
{
|
||||
"Q4_1",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1,
|
||||
" 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
|
||||
},
|
||||
{
|
||||
"Q5_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0,
|
||||
" 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
|
||||
},
|
||||
{
|
||||
"Q5_1",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1,
|
||||
" 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
|
||||
},
|
||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", },
|
||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", },
|
||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", },
|
||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", },
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
{
|
||||
"Q2_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K,
|
||||
" 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"Q3_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
"alias for Q3_K_M"
|
||||
},
|
||||
{
|
||||
"Q3_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_S,
|
||||
" 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
|
||||
},
|
||||
{
|
||||
"Q3_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_M,
|
||||
" 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
|
||||
},
|
||||
{
|
||||
"Q3_K_L",
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_L,
|
||||
" 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
|
||||
},
|
||||
{
|
||||
"Q4_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
"alias for Q4_K_M",
|
||||
},
|
||||
{
|
||||
"Q4_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_S,
|
||||
" 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
|
||||
},
|
||||
{
|
||||
"Q4_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q4_K_M,
|
||||
" 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q5_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
"alias for Q5_K_M",
|
||||
},
|
||||
{
|
||||
"Q5_K_S",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_S,
|
||||
" 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q5_K_M",
|
||||
LLAMA_FTYPE_MOSTLY_Q5_K_M,
|
||||
" 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
|
||||
},
|
||||
{
|
||||
"Q6_K",
|
||||
LLAMA_FTYPE_MOSTLY_Q6_K,
|
||||
" 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
|
||||
},
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
|
||||
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
|
||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
|
||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
|
||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", },
|
||||
#endif
|
||||
{
|
||||
"Q8_0",
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0,
|
||||
" 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"F16",
|
||||
LLAMA_FTYPE_MOSTLY_F16,
|
||||
"13.00G @ 7B - extremely large, virtually no quality loss - not recommended",
|
||||
},
|
||||
{
|
||||
"F32",
|
||||
LLAMA_FTYPE_ALL_F32,
|
||||
"26.00G @ 7B - absolutely huge, lossless - not recommended",
|
||||
},
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -2,10 +2,14 @@ set(TARGET server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp json.hpp httplib.h)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
|
||||
@@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the
|
||||
```sh
|
||||
curl --request POST \
|
||||
--url http://localhost:8080/completion \
|
||||
--header "Content-Type: application/json" \
|
||||
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
|
||||
```
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ tokenize() {
|
||||
--silent \
|
||||
--request POST \
|
||||
--url "${API_URL}/tokenize" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
|
||||
| jq '.tokens[]'
|
||||
}
|
||||
@@ -64,6 +65,7 @@ chat_completion() {
|
||||
--no-buffer \
|
||||
--request POST \
|
||||
--url "${API_URL}/completion" \
|
||||
--header "Content-Type: application/json" \
|
||||
--data-raw "${DATA}")
|
||||
|
||||
printf "\n"
|
||||
|
||||
@@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
@@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-base")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_base = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-freq-scale")
|
||||
{
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rope_freq_scale = std::stof(argv[i]);
|
||||
}
|
||||
else if (arg == "--memory-f32" || arg == "--memory_f32")
|
||||
{
|
||||
params.memory_f16 = false;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
set(TARGET simple)
|
||||
add_executable(${TARGET} simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
set(TARGET train-text-from-scratch)
|
||||
add_executable(${TARGET} train-text-from-scratch.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1434,7 +1434,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||
gf->perf_time_us = 0;
|
||||
|
||||
const auto & hparams = model->hparams;
|
||||
//const int n_ctx = hparams.n_ctx;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
@@ -1863,10 +1863,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||
t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
|
||||
t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd);
|
||||
t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
|
||||
t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
|
||||
t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
|
||||
t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch);
|
||||
t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
|
||||
t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
|
||||
t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
|
||||
t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch);
|
||||
t04->grad = expand(gb, ggml_add_inplace(ctx0,
|
||||
ggml_add_inplace(ctx0,
|
||||
|
||||
68
flake.nix
68
flake.nix
@@ -6,51 +6,51 @@
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
inherit (pkgs.stdenv) isAarch64 isDarwin;
|
||||
inherit (pkgs.lib) optionals;
|
||||
isM1 = isAarch64 && isDarwin;
|
||||
osSpecific = if isM1 then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
MetalPerformanceShaders
|
||||
MetalPerformanceShadersGraph
|
||||
]
|
||||
else if isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else
|
||||
[ ];
|
||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||
osSpecific = with pkgs; [ openmpi ] ++
|
||||
(
|
||||
if isAarch64 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
MetalPerformanceShaders
|
||||
MetalPerformanceShadersGraph
|
||||
]
|
||||
else if isAarch32 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else
|
||||
with pkgs; [ openblas ]
|
||||
);
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
|
||||
llama-python =
|
||||
pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
in {
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
postPatch = if isM1 then ''
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
'' else
|
||||
"";
|
||||
nativeBuildInputs = with pkgs; [ cmake ];
|
||||
substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
|
||||
'';
|
||||
nativeBuildInputs = nativeBuildInputs;
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
] else [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||
]);
|
||||
installPhase = ''
|
||||
mkdir -p $out/bin
|
||||
mv bin/* $out/bin/
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert.py
|
||||
cat ${./convert.py} >> $out/bin/convert.py
|
||||
chmod +x $out/bin/convert.py
|
||||
'';
|
||||
meta.mainProgram = "llama";
|
||||
};
|
||||
@@ -68,7 +68,7 @@
|
||||
};
|
||||
apps.default = self.apps.${system}.llama;
|
||||
devShells.default = pkgs.mkShell {
|
||||
packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
106
ggml-cuda.cu
106
ggml-cuda.cu
@@ -2423,20 +2423,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||
int id;
|
||||
CUDA_CHECK(cudaGetDevice(&id));
|
||||
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
int nnz = 0;
|
||||
size_t max_size = 0, tot_size = 0;
|
||||
#endif
|
||||
size_t best_diff = 1ull << 36;
|
||||
int ibest = -1;
|
||||
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
|
||||
cuda_buffer& b = g_cuda_buffer_pool[id][i];
|
||||
if (b.size >= size && b.ptr != nullptr) {
|
||||
void * ptr = b.ptr;
|
||||
*actual_size = b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
return ptr;
|
||||
if (b.ptr != nullptr) {
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
++nnz;
|
||||
tot_size += b.size;
|
||||
if (b.size > max_size) max_size = b.size;
|
||||
#endif
|
||||
if (b.size >= size) {
|
||||
size_t diff = b.size - size;
|
||||
if (diff < best_diff) {
|
||||
best_diff = diff;
|
||||
ibest = i;
|
||||
if (!best_diff) {
|
||||
void * ptr = b.ptr;
|
||||
*actual_size = b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ibest >= 0) {
|
||||
cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
|
||||
void * ptr = b.ptr;
|
||||
*actual_size = b.size;
|
||||
b.ptr = nullptr;
|
||||
b.size = 0;
|
||||
return ptr;
|
||||
}
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
|
||||
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
|
||||
#endif
|
||||
void * ptr;
|
||||
CUDA_CHECK(cudaMalloc((void **) &ptr, size));
|
||||
*actual_size = size;
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
||||
*actual_size = look_ahead_size;
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@@ -2512,6 +2545,9 @@ void ggml_init_cublas() {
|
||||
}
|
||||
|
||||
void ggml_cuda_set_tensor_split(const float * tensor_split) {
|
||||
if (tensor_split == nullptr) {
|
||||
return;
|
||||
}
|
||||
bool all_zero = true;
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
if (tensor_split[i] != 0.0f) {
|
||||
@@ -2952,8 +2988,13 @@ inline void ggml_cuda_op_rope(
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
|
||||
// RoPE alteration for extended context
|
||||
float freq_base, freq_scale;
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
|
||||
|
||||
bool is_glm = mode & 4;
|
||||
|
||||
@@ -3537,6 +3578,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
||||
(void) dst;
|
||||
}
|
||||
|
||||
void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_cpy(src0, dst, nullptr);
|
||||
(void) src1;
|
||||
}
|
||||
|
||||
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
|
||||
@@ -3646,6 +3692,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
||||
delete extra;
|
||||
}
|
||||
|
||||
static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
||||
static size_t g_temp_tensor_extra_index = 0;
|
||||
|
||||
static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||
if (g_temp_tensor_extras == nullptr) {
|
||||
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
|
||||
}
|
||||
|
||||
size_t alloc_index = g_temp_tensor_extra_index;
|
||||
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
|
||||
struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
|
||||
return extra;
|
||||
}
|
||||
|
||||
void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) {
|
||||
if (scratch && g_scratch_size == 0) {
|
||||
return;
|
||||
@@ -3654,7 +3716,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
// recursively assign CUDA buffers until a compute tensor is found
|
||||
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
||||
const ggml_op src0_op = tensor->src[0]->op;
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
|
||||
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
||||
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
|
||||
}
|
||||
}
|
||||
@@ -3663,8 +3725,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
struct ggml_tensor_extra_gpu * extra;
|
||||
|
||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
||||
tensor->op == GGML_OP_VIEW ||
|
||||
@@ -3679,10 +3740,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
if (tensor->op == GGML_OP_VIEW) {
|
||||
memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
|
||||
}
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src0_ddc + offset;
|
||||
} else if (tensor->op == GGML_OP_CPY) {
|
||||
struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
||||
void * src1_ddv = src1_extra->data_device[g_main_device];
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = src1_ddv;
|
||||
} else if (scratch) {
|
||||
GGML_ASSERT(size <= g_scratch_size);
|
||||
@@ -3695,6 +3758,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
||||
g_scratch_buffer = data;
|
||||
}
|
||||
extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||
extra->data_device[g_main_device] = data + g_scratch_offset;
|
||||
|
||||
g_scratch_offset += size;
|
||||
@@ -3704,6 +3768,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
|
||||
void * data;
|
||||
CUDA_CHECK(cudaMalloc(&data, size));
|
||||
CUDA_CHECK(cudaMemset(data, 0, size));
|
||||
extra = new ggml_tensor_extra_gpu;
|
||||
memset(extra, 0, sizeof(*extra));
|
||||
extra->data_device[g_main_device] = data;
|
||||
}
|
||||
|
||||
@@ -3756,6 +3822,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
||||
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_DUP:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_dup;
|
||||
break;
|
||||
case GGML_OP_ADD:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
@@ -3810,6 +3882,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
}
|
||||
func = ggml_cuda_cpy;
|
||||
break;
|
||||
case GGML_OP_CONT:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_dup;
|
||||
break;
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
|
||||
91
ggml-metal.m
91
ggml-metal.m
@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -685,8 +685,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
@@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
||||
} break;
|
||||
default:
|
||||
@@ -739,16 +739,22 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q2_K ||
|
||||
src0t == GGML_TYPE_Q3_K ||
|
||||
src0t == GGML_TYPE_Q4_K ||
|
||||
src0t == GGML_TYPE_Q5_K ||
|
||||
src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#else
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#endif
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
@@ -792,7 +798,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const float eps = 1e-6f;
|
||||
|
||||
const int nth = 256;
|
||||
const int nth = 512;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rms_norm];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@@ -800,7 +806,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
||||
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
||||
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
|
||||
|
||||
const int64_t nrows = ggml_nrows(src0);
|
||||
|
||||
@@ -881,28 +887,35 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const int n_past = ((int32_t *)(src1->data))[0];
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_rope];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&n_past length:sizeof( int) atIndex:18];
|
||||
[encoder setBytes:&n_dims length:sizeof( int) atIndex:19];
|
||||
[encoder setBytes:&mode length:sizeof( int) atIndex:20];
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
|
||||
1139
ggml-metal.metal
1139
ggml-metal.metal
File diff suppressed because it is too large
Load Diff
85
ggml.c
85
ggml.c
@@ -31,11 +31,17 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// disable "possible loss of data" to avoid hundreds of casts
|
||||
@@ -112,10 +118,6 @@ typedef void * thread_ret_t;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __HAIKU__
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#endif
|
||||
|
||||
/*#define GGML_PERF*/
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
@@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) {
|
||||
if (&g_state.contexts[i].context == ctx) {
|
||||
g_state.contexts[i].used = false;
|
||||
|
||||
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
|
||||
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
|
||||
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
|
||||
__func__, i, ggml_used_mem(ctx));
|
||||
|
||||
if (ctx->mem_buffer_owned) {
|
||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
||||
@@ -6955,6 +6957,8 @@ struct ggml_tensor * ggml_rope_impl(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
bool inplace) {
|
||||
GGML_ASSERT(n_past >= 0);
|
||||
bool is_node = false;
|
||||
@@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl(
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
||||
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6);
|
||||
|
||||
((int32_t *) b->data)[0] = n_past;
|
||||
((int32_t *) b->data)[1] = n_dims;
|
||||
((int32_t *) b->data)[2] = mode;
|
||||
((int32_t *) b->data)[3] = n_ctx;
|
||||
memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float));
|
||||
memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float));
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
@@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false);
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_inplace(
|
||||
@@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace(
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true);
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale) {
|
||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true);
|
||||
}
|
||||
|
||||
// ggml_rope_back
|
||||
@@ -7011,7 +7029,8 @@ struct ggml_tensor * ggml_rope_back(
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode) {
|
||||
int mode,
|
||||
int n_ctx) {
|
||||
GGML_ASSERT(n_past >= 0);
|
||||
GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
|
||||
|
||||
@@ -7025,12 +7044,13 @@ struct ggml_tensor * ggml_rope_back(
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
|
||||
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
|
||||
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
||||
ggml_set_name(b, "n_past, n_dims, mode");
|
||||
|
||||
((int32_t *) b->data)[0] = n_past;
|
||||
((int32_t *) b->data)[1] = n_dims;
|
||||
((int32_t *) b->data)[2] = mode;
|
||||
((int32_t *) b->data)[3] = n_ctx;
|
||||
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
@@ -12072,16 +12092,21 @@ static void ggml_compute_forward_rope_f32(
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 4);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 6);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
@@ -12110,7 +12135,7 @@ static void ggml_compute_forward_rope_f32(
|
||||
// row index used to determine which thread to use
|
||||
int ir = 0;
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
@@ -12122,7 +12147,7 @@ static void ggml_compute_forward_rope_f32(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = (float)p;
|
||||
float theta = freq_scale * (float)p;
|
||||
|
||||
if (is_glm) {
|
||||
theta = MIN(p, n_ctx - 2);
|
||||
@@ -12199,16 +12224,21 @@ static void ggml_compute_forward_rope_f16(
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 4);
|
||||
GGML_ASSERT(ggml_nelements(src1) == 6);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float));
|
||||
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
@@ -12237,7 +12267,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
// row index used to determine which thread to use
|
||||
int ir = 0;
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
|
||||
const bool is_neox = mode & 2;
|
||||
const bool is_glm = mode & 4;
|
||||
@@ -12249,7 +12279,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
if (ir++ < ir0) continue;
|
||||
if (ir > ir1) break;
|
||||
|
||||
float theta = (float)p;
|
||||
float theta = freq_scale * (float)p;
|
||||
|
||||
if (is_glm) {
|
||||
theta = MIN(p, n_ctx - 2);
|
||||
@@ -12310,7 +12340,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
||||
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
||||
|
||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
||||
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
}
|
||||
@@ -12349,7 +12379,7 @@ static void ggml_compute_forward_rope_back_f32(
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(src1->type == GGML_TYPE_I32);
|
||||
assert(ggml_nelements(src1) == 3);
|
||||
assert(ggml_nelements(src1) == 4);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
@@ -15708,17 +15738,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
// necessary for llama
|
||||
if (src0->grad) {
|
||||
assert(src1->type == GGML_TYPE_I32);
|
||||
assert(ggml_nelements(src1) == 4);
|
||||
assert(ggml_nelements(src1) == 6);
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
src0->grad = ggml_add_impl(ctx,
|
||||
src0->grad,
|
||||
ggml_rope_back(ctx,
|
||||
tensor->grad,
|
||||
n_past,
|
||||
n_dims,
|
||||
mode),
|
||||
mode,
|
||||
n_ctx),
|
||||
inplace);
|
||||
}
|
||||
if (src1->grad) {
|
||||
@@ -16289,8 +16321,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.nth = n_tasks_arr[node_n];
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
|
||||
// distribute new work or execute it direct if 1T
|
||||
@@ -16320,8 +16352,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.type = GGML_TASK_FINALIZE;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -16863,9 +16896,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
|
||||
}
|
||||
|
||||
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
||||
//assert(cgraph->work == NULL);
|
||||
//assert(cgraph->work_size == 0);
|
||||
|
||||
uint64_t size_eval = 0;
|
||||
|
||||
// compute size of intermediate results
|
||||
@@ -17304,9 +17334,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
||||
|
||||
GGML_PRINT("=== GRAPH ===\n");
|
||||
|
||||
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
|
||||
GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
|
||||
|
||||
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
14
ggml.h
14
ggml.h
@@ -1121,6 +1121,17 @@ extern "C" {
|
||||
int mode,
|
||||
int n_ctx);
|
||||
|
||||
// custom RoPE, in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode,
|
||||
int n_ctx,
|
||||
float freq_base,
|
||||
float freq_scale);
|
||||
|
||||
// rotary position embedding backward, i.e compute dx from dy
|
||||
// a - dy
|
||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||
@@ -1128,7 +1139,8 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode);
|
||||
int mode,
|
||||
int n_ctx);
|
||||
|
||||
// alibi position embedding
|
||||
// in-place, returns view(a)
|
||||
|
||||
@@ -15,6 +15,14 @@
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif
|
||||
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
// Super-block quantization structures
|
||||
//
|
||||
|
||||
199
llama.cpp
199
llama.cpp
@@ -98,17 +98,17 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
||||
}
|
||||
|
||||
//
|
||||
// memory sizes
|
||||
// memory sizes (calculated for n_batch == 512)
|
||||
//
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 256ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
{ MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
|
||||
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
||||
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
||||
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
||||
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
@@ -116,38 +116,24 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 256ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
{ MODEL_3B, 128ull * MB },
|
||||
{ MODEL_7B, 160ull * MB },
|
||||
{ MODEL_13B, 192ull * MB },
|
||||
{ MODEL_30B, 256ull * MB },
|
||||
{ MODEL_65B, 384ull * MB }, // guess
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
||||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 682ull * MB },
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
{ MODEL_65B, 5120ull * MB },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
// used to store the compute graph tensors + non-scratch data
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 512ull * MB },
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
{ MODEL_3B, 8ull * MB },
|
||||
{ MODEL_7B, 10ull * MB },
|
||||
{ MODEL_13B, 12ull * MB },
|
||||
{ MODEL_30B, 16ull * MB },
|
||||
{ MODEL_65B, 24ull * MB }, // guess
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
@@ -189,11 +175,24 @@ struct llama_hparams {
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
uint32_t n_rot = 64;
|
||||
|
||||
float rope_freq_base = 10000.0f;
|
||||
float rope_freq_scale = 1.0f;
|
||||
|
||||
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
|
||||
}
|
||||
|
||||
size_t kv_size() const {
|
||||
size_t result = 2ull;
|
||||
result *= (size_t) n_embd;
|
||||
result *= (size_t) n_ctx;
|
||||
result *= (size_t) n_layer;
|
||||
result *= sizeof(ggml_fp16_t);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_layer {
|
||||
@@ -550,7 +549,9 @@ struct llama_file_loader {
|
||||
}
|
||||
|
||||
// skip to the next multiple of 32 bytes
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
}
|
||||
|
||||
tensor.file_off = file.tell();
|
||||
tensor.name = name;
|
||||
@@ -647,7 +648,7 @@ struct llama_model_loader {
|
||||
*ctx_size_p = *mmapped_size_p = 0;
|
||||
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
||||
*ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
||||
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
|
||||
*(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -842,7 +843,9 @@ struct llama_context_params llama_context_default_params() {
|
||||
/*.n_batch =*/ 512,
|
||||
/*.gpu_layers =*/ 0,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ {0},
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.rope_freq_base =*/ 10000.0f,
|
||||
/*.rope_freq_scale =*/ 1.0f,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.low_vram =*/ false,
|
||||
@@ -868,6 +871,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
return result;
|
||||
}
|
||||
|
||||
int llama_max_devices() {
|
||||
return LLAMA_MAX_DEVICES;
|
||||
}
|
||||
|
||||
bool llama_mmap_supported() {
|
||||
return llama_mmap::SUPPORTED;
|
||||
}
|
||||
@@ -966,6 +973,8 @@ static void llama_model_load_internal(
|
||||
int n_gpu_layers,
|
||||
int main_gpu,
|
||||
const float * tensor_split,
|
||||
float rope_freq_base,
|
||||
float rope_freq_scale,
|
||||
bool low_vram,
|
||||
ggml_type memory_type,
|
||||
bool use_mmap,
|
||||
@@ -1000,22 +1009,27 @@ static void llama_model_load_internal(
|
||||
}
|
||||
|
||||
hparams.n_ctx = n_ctx;
|
||||
|
||||
hparams.rope_freq_base = rope_freq_base;
|
||||
hparams.rope_freq_scale = rope_freq_scale;
|
||||
}
|
||||
|
||||
const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
||||
|
||||
{
|
||||
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
||||
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
||||
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
||||
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
||||
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
||||
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
|
||||
fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
||||
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
|
||||
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
|
||||
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
|
||||
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
||||
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
||||
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
|
||||
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
|
||||
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
|
||||
}
|
||||
|
||||
if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
|
||||
@@ -1049,7 +1063,7 @@ static void llama_model_load_internal(
|
||||
{
|
||||
model.buf.resize(ctx_size);
|
||||
if (use_mlock) {
|
||||
model.mlock_buf.init(model.buf.addr);
|
||||
model.mlock_buf.init (model.buf.addr);
|
||||
model.mlock_buf.grow_to(model.buf.size);
|
||||
}
|
||||
|
||||
@@ -1164,13 +1178,13 @@ static void llama_model_load_internal(
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size - vram_weights + // weights in VRAM not in memory
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at (model.type);
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
scale*hparams.kv_size();
|
||||
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
@@ -1211,7 +1225,7 @@ static void llama_model_load_internal(
|
||||
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
||||
} else {
|
||||
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
|
||||
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
|
||||
vram_kv_cache += hparams.kv_size() / 2;
|
||||
}
|
||||
}
|
||||
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
||||
@@ -1219,7 +1233,7 @@ static void llama_model_load_internal(
|
||||
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
||||
} else {
|
||||
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
|
||||
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
|
||||
vram_kv_cache += hparams.kv_size() / 2;
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
@@ -1269,7 +1283,9 @@ static bool llama_model_load(
|
||||
int n_batch,
|
||||
int n_gpu_layers,
|
||||
int main_gpu,
|
||||
float * tensor_split,
|
||||
const float * tensor_split,
|
||||
float rope_freq_base,
|
||||
float rope_freq_scale,
|
||||
bool low_vram,
|
||||
ggml_type memory_type,
|
||||
bool use_mmap,
|
||||
@@ -1278,7 +1294,7 @@ static bool llama_model_load(
|
||||
llama_progress_callback progress_callback,
|
||||
void *progress_callback_user_data) {
|
||||
try {
|
||||
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
||||
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
||||
return true;
|
||||
} catch (const std::exception & err) {
|
||||
@@ -1330,6 +1346,9 @@ static bool llama_eval_internal(
|
||||
const int n_rot = hparams.n_embd/hparams.n_head;
|
||||
const int n_gpu_layers = model.n_gpu_layers;
|
||||
|
||||
const float freq_base = hparams.rope_freq_base;
|
||||
const float freq_scale = hparams.rope_freq_scale;
|
||||
|
||||
auto & mem_per_token = lctx.mem_per_token;
|
||||
auto & buf_compute = lctx.buf_compute;
|
||||
|
||||
@@ -1427,11 +1446,11 @@ static bool llama_eval_internal(
|
||||
offload_func_kq(tmpq);
|
||||
ggml_set_name(tmpq, "tmpq");
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
|
||||
offload_func_kq(Kcur);
|
||||
ggml_set_name(Kcur, "Kcur");
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
|
||||
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale);
|
||||
offload_func_kq(Qcur);
|
||||
ggml_set_name(Qcur, "Qcur");
|
||||
|
||||
@@ -1714,10 +1733,12 @@ static bool llama_eval_internal(
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
|
||||
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
||||
ggml_used_mem(ctx0)/1024.0/1024.0,
|
||||
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
||||
lctx.get_buf_max_mem(1)/1024.0/1024.0);
|
||||
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
||||
lctx.work_buffer.size()/1024.0/1024.0,
|
||||
n_past, N);
|
||||
#endif
|
||||
|
||||
ggml_free(ctx0);
|
||||
@@ -2005,9 +2026,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
|
||||
}
|
||||
|
||||
// Normalize the second derivatives
|
||||
float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
||||
for (float & value : second_derivatives) {
|
||||
value /= second_derivatives_sum;
|
||||
{
|
||||
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
||||
|
||||
if (second_derivatives_sum > 1e-6f) {
|
||||
for (float & value : second_derivatives) {
|
||||
value /= second_derivatives_sum;
|
||||
}
|
||||
} else {
|
||||
for (float & value : second_derivatives) {
|
||||
value = 1.0f / second_derivatives.size();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
float cum_sum = 0.0f;
|
||||
@@ -2184,9 +2214,8 @@ void llama_sample_classifier_free_guidance(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
struct llama_context * guidance_ctx,
|
||||
float scale,
|
||||
float smooth_factor) {
|
||||
int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
|
||||
float scale) {
|
||||
int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
assert(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
@@ -2206,16 +2235,7 @@ void llama_sample_classifier_free_guidance(
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
float logit_guidance = logits_guidance[i];
|
||||
float logit_base = logits_base[i];
|
||||
logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance;
|
||||
}
|
||||
|
||||
llama_log_softmax(logits_guidance, n_vocab);
|
||||
|
||||
for (int i = 0; i < n_vocab; ++i) {
|
||||
float logit_base = logits_base[i];
|
||||
float logit_guidance = logits_guidance[i];
|
||||
|
||||
candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
|
||||
candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
@@ -2424,8 +2444,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
||||
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
||||
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
||||
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
||||
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
// K-quants
|
||||
@@ -2509,16 +2529,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
} else {
|
||||
new_type = quantized_type;
|
||||
#ifdef GGML_USE_K_QUANTS
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
||||
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
||||
int nx = tensor.ne.at(0);
|
||||
int ny = tensor.ne.at(1);
|
||||
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
||||
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
||||
convert_incompatible_tensor = true;
|
||||
}
|
||||
}
|
||||
if (tensor.name == "output.weight") {
|
||||
int nx = tensor.ne.at(0);
|
||||
int ny = tensor.ne.at(1);
|
||||
@@ -2544,6 +2554,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
||||
int nx = tensor.ne.at(0);
|
||||
int ny = tensor.ne.at(1);
|
||||
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
||||
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
||||
convert_incompatible_tensor = true;
|
||||
}
|
||||
}
|
||||
if (convert_incompatible_tensor) {
|
||||
if (tensor.name == "output.weight") {
|
||||
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
||||
@@ -2570,7 +2590,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
f32_data = (float *) f32_conv_buf.addr;
|
||||
}
|
||||
|
||||
printf("quantizing .. ");
|
||||
printf("quantizing to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
work.resize(nelements * 4); // upper bound on size
|
||||
@@ -2674,8 +2694,9 @@ struct llama_model * llama_load_model_from_file(
|
||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
||||
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
||||
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
||||
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
||||
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
||||
params.progress_callback_user_data)) {
|
||||
delete model;
|
||||
fprintf(stderr, "%s: failed to load model\n", __func__);
|
||||
return nullptr;
|
||||
@@ -2752,7 +2773,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
||||
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
||||
}
|
||||
|
||||
|
||||
14
llama.h
14
llama.h
@@ -88,7 +88,13 @@ extern "C" {
|
||||
int32_t n_batch; // prompt processing batch size
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
int32_t main_gpu; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
||||
|
||||
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency
|
||||
float rope_freq_scale; // RoPE frequency scaling factor
|
||||
|
||||
// called with a progress value between 0 and 1, pass NULL to disable
|
||||
llama_progress_callback progress_callback;
|
||||
// context pointer passed to the progress callback
|
||||
@@ -148,6 +154,8 @@ extern "C" {
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
LLAMA_API int llama_max_devices();
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
||||
|
||||
@@ -336,13 +344,11 @@ extern "C" {
|
||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
|
||||
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
|
||||
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
|
||||
/// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
|
||||
LLAMA_API void llama_sample_classifier_free_guidance(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
struct llama_context * guidance_ctx,
|
||||
float scale,
|
||||
float smooth_factor);
|
||||
float scale);
|
||||
|
||||
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
||||
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||
|
||||
2
scripts/verify-checksum-models.py
Normal file → Executable file
2
scripts/verify-checksum-models.py
Normal file → Executable file
@@ -1,3 +1,5 @@
|
||||
#!/bin/env python3
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
function(llama_add_test source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
@@ -200,4 +200,6 @@ int main(void) {
|
||||
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user