Compare commits

..

1 Commits

Author SHA1 Message Date
M. Yusuf Sarıgöz
932589c0ef Honor -ngl option for Cuda offloading in llava 2023-10-14 03:12:10 +03:00
156 changed files with 18578 additions and 19551 deletions

View File

@@ -1,7 +1,8 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
name: Issue and enhancement template
about: Used to report issues and request enhancements for llama.cpp
title: "[User] Insert summary of your issue or enhancement.."
labels: ''
assignees: ''
---
@@ -45,7 +46,7 @@ $ g++ --version
# Failure Information (for bugs)
Please help provide information about the failure / bug.
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
# Steps to Reproduce

View File

@@ -1,28 +0,0 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

View File

@@ -288,7 +288,6 @@ jobs:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.21.1-2023-04-24
strategy:
matrix:
@@ -384,23 +383,11 @@ jobs:
- name: Test
id: cmake_test
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --verbose --timeout 900
- name: Test (Intel SDE)
id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
& $sde -future -- ctest -C Release --verbose --timeout 900
- name: Determine tag name
id: tag
shell: bash

6
.gitignore vendored
View File

@@ -10,12 +10,10 @@
*.gcno
*.gcda
*.dot
*.bat
*.metallib
.DS_Store
.build/
.cache/
.ccls-cache/
.direnv/
.envrc
.swiftpm
@@ -46,7 +44,7 @@ models-mnt
/infill
/libllama.so
/llama-bench
/llava-cli
/llava
/main
/metal
/perplexity
@@ -65,7 +63,7 @@ models-mnt
/parallel
/train-text-from-scratch
/vdot
/common/build-info.cpp
build-info.h
arm_neon.h
compile_commands.json
CMakeSettings.json

View File

@@ -10,7 +10,7 @@ endif()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON)
# configure project version
@@ -82,7 +82,6 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -94,12 +93,46 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
#
# Build info header
#
# Generate initial build-info.h
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
endif()
# Add a custom target for build-info.h
add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
# Add a custom command to rebuild build-info.h when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS "${GIT_DIR}/index"
VERBATIM
)
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
endif()
#
# Compile flags
#
@@ -244,8 +277,13 @@ if (LLAMA_BLAS)
endif()
endif()
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
if (LLAMA_K_QUANTS)
set(GGML_HEADERS_EXTRA k_quants.h)
set(GGML_SOURCES_EXTRA k_quants.c)
add_compile_definitions(GGML_USE_K_QUANTS)
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
endif()
if (LLAMA_CUBLAS)
@@ -267,9 +305,6 @@ if (LLAMA_CUBLAS)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -296,7 +331,6 @@ if (LLAMA_CUBLAS)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -370,9 +404,6 @@ if (LLAMA_HIPBLAS)
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -510,10 +541,6 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
message(STATUS "x86 detected")
if (MSVC)
# instruction set detection for MSVC only
if (LLAMA_NATIVE)
include(cmake/FindSIMD.cmake)
endif ()
if (LLAMA_AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
@@ -638,8 +665,6 @@ add_library(ggml OBJECT
ggml-alloc.h
ggml-backend.c
ggml-backend.h
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}

108
Makefile
View File

@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
# Binaries only useful for tests
@@ -342,9 +342,13 @@ else
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
ifndef LLAMA_NO_K_QUANTS
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
OBJS += k_quants.o
ifdef LLAMA_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64
endif
endif
ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
@@ -361,7 +365,7 @@ ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS
@@ -378,7 +382,7 @@ endif # LLAMA_BLIS
ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
@@ -393,9 +397,6 @@ endif # CUDA_DOCKER_ARCH
ifdef LLAMA_CUDA_FORCE_DMMV
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_FORCE_MMQ
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # LLAMA_CUDA_FORCE_MMQ
ifdef LLAMA_CUDA_DMMV_X
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
else
@@ -493,6 +494,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifndef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
# combine build flags with cmdline overrides
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -533,16 +539,13 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
$(CC) $(CFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
OBJS += ggml-alloc.o ggml-backend.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -563,47 +566,54 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean:
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
#
# Examples
#
main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -614,31 +624,28 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
ifdef LLAMA_METAL
@@ -651,7 +658,7 @@ swift: examples/batched.swift
(cd examples/batched.swift; make build)
endif
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh $(CC) > $@.tmp
@if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \
@@ -659,16 +666,13 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
rm $@.tmp; \
fi
build-info.o: common/build-info.cpp
$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
#
# Tests
#
tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
run-benchmark-matmult: benchmark-matmult
@@ -682,40 +686,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h

View File

@@ -42,12 +42,13 @@ let package = Package(
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"k_quants.c",
] + additionalSources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14

View File

@@ -2,6 +2,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -10,7 +11,12 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
- Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
**Devs should become familiar with the new API**
- Local Falcon 180B inference on Mac Studio
https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
----
@@ -83,23 +89,21 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -202,7 +206,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
## Usage
Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model.
Here are the steps for the LLaMA-7B model.
### Get the Code
@@ -424,7 +428,7 @@ Building the program with BLAS support may lead to some performance improvements
```
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
@@ -569,18 +573,6 @@ python3 convert.py models/7B/
When running the larger models, make sure you have enough disk space to store all the intermediate files.
### Running on Windows with prebuilt binaries
You will find prebuilt Windows binaries on the release page.
Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)
From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:
```
.\main -m llama-2-7b.Q4_0.gguf -n 128
```
### Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
@@ -960,6 +952,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
- [main](./examples/main/README.md)
- [server](./examples/server/README.md)
- [embd-input](./examples/embd-input/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)

View File

@@ -10,6 +10,7 @@ const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
config_header: *ConfigHeader,
enable_lto: bool,
include_dirs: ArrayList([]const u8),
@@ -40,24 +41,26 @@ const Maker = struct {
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
const config_header = builder.addConfigHeader(
.{ .style = .blank, .include_path = "build-info.h" },
.{
.BUILD_NUMBER = 0,
.BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
.BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
.BUILD_TARGET = try target.allocDescription(builder.allocator),
},
);
var m = Maker{
.builder = builder,
.target = target,
.optimize = builder.standardOptimizeOption(.{}),
.config_header = config_header,
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
@@ -69,7 +72,7 @@ const Maker = struct {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
o.addConfigHeader(m.config_header);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
@@ -82,6 +85,7 @@ const Maker = struct {
o.linkLibCpp();
}
}
o.addConfigHeader(m.config_header);
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
@@ -101,6 +105,7 @@ const Maker = struct {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
e.addConfigHeader(m.config_header);
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
@@ -111,27 +116,30 @@ pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
try make.addFlag("-DGGML_USE_K_QUANTS");
const k_quants = make.obj("k_quants", "k_quants.c");
try make.objs.append(k_quants);
}
const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}

View File

@@ -208,8 +208,6 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -298,7 +296,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@@ -385,8 +382,6 @@ function gg_run_open_llama_7b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -475,7 +470,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"

View File

@@ -1,100 +0,0 @@
include(CheckCSourceRuns)
set(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
set(AVX512_CODE "
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
__m512i b = a;
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
return 0;
}
")
set(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
__m256i x;
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
return 0;
}
")
set(FMA_CODE "
#include <immintrin.h>
int main()
{
__m256 acc = _mm256_setzero_ps();
const __m256 d = _mm256_setzero_ps();
const __m256 p = _mm256_setzero_ps();
acc = _mm256_fmadd_ps( d, p, acc );
return 0;
}
")
macro(check_sse type flags)
set(__FLAG_I 1)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
foreach (__FLAG ${flags})
if (NOT ${type}_FOUND)
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
if (HAS_${type}_${__FLAG_I})
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
endif()
math(EXPR __FLAG_I "${__FLAG_I}+1")
endif()
endforeach()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
if (NOT ${type}_FOUND)
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
set(${type}_FLAGS "" CACHE STRING "${type} flags")
endif()
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()
# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(LLAMA_AVX OFF)
else()
set(LLAMA_AVX ON)
endif()
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(LLAMA_AVX2 OFF)
else()
set(LLAMA_AVX2 ON)
endif()
check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(LLAMA_AVX512 OFF)
else()
set(LLAMA_AVX512 ON)
endif()

View File

@@ -1,47 +1,8 @@
# common
# Build info header
#
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
# Is git submodule
if(NOT IS_DIRECTORY "${GIT_DIR}")
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
endif()
set(GIT_INDEX "${GIT_DIR}/index")
else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
endif()
# Add a custom command to rebuild build-info.cpp when .git/index changes
add_custom_command(
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
)
set(TARGET build_info)
add_library(${TARGET} OBJECT build-info.cpp)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(TARGET common)
add_library(${TARGET} STATIC
base64.hpp
add_library(${TARGET} OBJECT
common.h
common.cpp
sampling.h
@@ -60,4 +21,4 @@ endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama build_info)
target_link_libraries(${TARGET} PRIVATE llama)

View File

@@ -1,392 +0,0 @@
/*
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org>
*/
#ifndef PUBLIC_DOMAIN_BASE64_HPP_
#define PUBLIC_DOMAIN_BASE64_HPP_
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <string>
class base64_error : public std::runtime_error
{
public:
using std::runtime_error::runtime_error;
};
class base64
{
public:
enum class alphabet
{
/** the alphabet is detected automatically */
auto_,
/** the standard base64 alphabet is used */
standard,
/** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
url_filename_safe
};
enum class decoding_behavior
{
/** if the input is not padded, the remaining bits are ignored */
moderate,
/** if a padding character is encounter decoding is finished */
loose
};
/**
Encodes all the elements from `in_begin` to `in_end` to `out`.
@warning The source and destination cannot overlap. The destination must be able to hold at least
`required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
@tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
8 bits
@tparam Output_iterator the destination; the elements written to it are from the type `char`
@param in_begin the beginning of the source
@param in_end the ending of the source
@param out the destination iterator
@param alphabet which alphabet should be used
@returns the iterator to the next element past the last element copied
@throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
alphabet alphabet = alphabet::standard)
{
constexpr auto pad = '=';
const char* alpha = alphabet == alphabet::url_filename_safe
? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
while (in_begin != in_end) {
std::uint8_t i0 = 0, i1 = 0, i2 = 0;
// first character
i0 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[i0 >> 2 & 0x3f];
++out;
// part of first character and second
if (in_begin != in_end) {
i1 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
++out;
} else {
*out = alpha[(i0 & 0x3) << 4];
++out;
// last padding
*out = pad;
++out;
// last padding
*out = pad;
++out;
break;
}
// part of second character and third
if (in_begin != in_end) {
i2 = static_cast<std::uint8_t>(*in_begin);
++in_begin;
*out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
++out;
} else {
*out = alpha[(i1 & 0xf) << 2];
++out;
// last padding
*out = pad;
++out;
break;
}
// rest of third
*out = alpha[i2 & 0x3f];
++out;
}
return out;
}
/**
Encodes a string.
@param str the string that should be encoded
@param alphabet which alphabet should be used
@returns the encoded base64 string
@throws see base64::encode()
*/
static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
{
std::string result;
result.reserve(required_encode_size(str.length()) + 1);
encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
return result;
}
/**
Encodes a char array.
@param buffer the char array
@param size the size of the array
@param alphabet which alphabet should be used
@returns the encoded string
*/
static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
{
std::string result;
result.reserve(required_encode_size(size) + 1);
encode(buffer, buffer + size, std::back_inserter(result), alphabet);
return result;
}
/**
Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
in other words: inplace decoding is possible.
@warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
otherwise the behavior depends on the output iterator.
@tparam Input_iterator the source; the returned elements are cast to `char`
@tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
@param in_begin the beginning of the source
@param in_end the ending of the source
@param out the destination iterator
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the iterator to the next element past the last element copied
@throws base64_error depending on the set behavior
@throws see `Input_iterator` and `Output_iterator`
*/
template<typename Input_iterator, typename Output_iterator>
static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
//constexpr auto pad = '=';
std::uint8_t last = 0;
auto bits = 0;
while (in_begin != in_end) {
auto c = *in_begin;
++in_begin;
if (c == '=') {
break;
}
auto part = _base64_value(alphabet, c);
// enough bits for one byte
if (bits + 6 >= 8) {
*out = (last << (8 - bits)) | (part >> (bits - 2));
++out;
bits -= 2;
} else {
bits += 6;
}
last = part;
}
// check padding
if (behavior != decoding_behavior::loose) {
while (in_begin != in_end) {
auto c = *in_begin;
++in_begin;
if (c != '=') {
throw base64_error("invalid base64 character.");
}
}
}
return out;
}
/**
Decodes a string.
@param str the base64 encoded string
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the decoded string
@throws see base64::decode()
*/
static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
std::string result;
result.reserve(max_decode_size(str.length()));
decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
return result;
}
/**
Decodes a string.
@param buffer the base64 encoded buffer
@param size the size of the buffer
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the decoded string
@throws see base64::decode()
*/
static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
std::string result;
result.reserve(max_decode_size(size));
decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
return result;
}
/**
Decodes a string inplace.
@param[in,out] str the base64 encoded string
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@throws base64::decode_inplace()
*/
static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
}
/**
Decodes a char array inplace.
@param[in,out] str the string array
@param size the length of the array
@param alphabet which alphabet should be used
@param behavior the behavior when an error was detected
@returns the pointer to the next element past the last element decoded
@throws base64::decode_inplace()
*/
static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
decoding_behavior behavior = decoding_behavior::moderate)
{
return decode(str, str + size, str, alphabet, behavior);
}
/**
Returns the required decoding size for a given size. The value is calculated with the following formula:
$$
\lceil \frac{size}{4} \rceil \cdot 3
$$
@param size the size of the encoded input
@returns the size of the resulting decoded buffer; this the absolute maximum
*/
static std::size_t max_decode_size(std::size_t size) noexcept
{
return (size / 4 + (size % 4 ? 1 : 0)) * 3;
}
/**
Returns the required encoding size for a given size. The value is calculated with the following formula:
$$
\lceil \frac{size}{3} \rceil \cdot 4
$$
@param size the size of the decoded input
@returns the size of the resulting encoded buffer
*/
static std::size_t required_encode_size(std::size_t size) noexcept
{
return (size / 3 + (size % 3 ? 1 : 0)) * 4;
}
private:
static std::uint8_t _base64_value(alphabet& alphabet, char c)
{
if (c >= 'A' && c <= 'Z') {
return c - 'A';
} else if (c >= 'a' && c <= 'z') {
return c - 'a' + 26;
} else if (c >= '0' && c <= '9') {
return c - '0' + 52;
}
// comes down to alphabet
if (alphabet == alphabet::standard) {
if (c == '+') {
return 62;
} else if (c == '/') {
return 63;
}
} else if (alphabet == alphabet::url_filename_safe) {
if (c == '-') {
return 62;
} else if (c == '_') {
return 63;
}
} // auto detect
else {
if (c == '+') {
alphabet = alphabet::standard;
return 62;
} else if (c == '/') {
alphabet = alphabet::standard;
return 63;
} else if (c == '-') {
alphabet = alphabet::url_filename_safe;
return 62;
} else if (c == '_') {
alphabet = alphabet::url_filename_safe;
return 63;
}
}
throw base64_error("invalid base64 character.");
}
};
#endif // !PUBLIC_DOMAIN_BASE64_HPP_

View File

@@ -1,4 +0,0 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";

View File

@@ -1,4 +1,5 @@
#include "common.h"
#include "build-info.h"
#include "llama.h"
#include <algorithm>
@@ -90,19 +91,6 @@ void process_escapes(std::string& input) {
case '\'': input[output_idx++] = '\''; break;
case '\"': input[output_idx++] = '\"'; break;
case '\\': input[output_idx++] = '\\'; break;
case 'x':
// Handle \x12, etc
if (input_idx + 2 < input_len) {
const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
char *err_p = nullptr;
const long val = std::strtol(x, &err_p, 16);
if (err_p == x + 2) {
input_idx += 2;
input[output_idx++] = char(val);
break;
}
}
// fall through
default: input[output_idx++] = '\\';
input[output_idx++] = input[input_idx]; break;
}
@@ -115,26 +103,11 @@ void process_escapes(std::string& input) {
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool result = true;
try {
if (!gpt_params_parse_ex(argc, argv, params)) {
gpt_print_usage(argc, argv, gpt_params());
exit(0);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
gpt_print_usage(argc, argv, gpt_params());
exit(1);
}
return result;
}
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
llama_sampling_params & sparams = params.sparams;
llama_sampling_params & sparams = params.sampling_params;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@@ -231,52 +204,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.rope_freq_scale = std::stof(argv[i]);
} else if (arg == "--rope-scaling") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
else { invalid_param = true; break; }
} else if (arg == "--rope-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = 1.0f/std::stof(argv[i]);
} else if (arg == "--yarn-orig-ctx") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_orig_ctx = std::stoi(argv[i]);
} else if (arg == "--yarn-ext-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
} else if (arg == "--yarn-attn-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
} else if (arg == "--yarn-beta-fast") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
} else if (arg == "--yarn-beta-slow") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
} else if (arg == "--memory-f32") {
params.memory_f16 = false;
} else if (arg == "--top-p") {
@@ -285,19 +218,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
sparams.top_p = std::stof(argv[i]);
} else if (arg == "--min-p") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.min_p = std::stof(argv[i]);
} else if (arg == "--temp") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.temp = std::stof(argv[i]);
sparams.temp = std::max(sparams.temp, 0.0f);
} else if (arg == "--tfs") {
if (++i >= argc) {
invalid_param = true;
@@ -315,26 +241,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
sparams.penalty_last_n = std::stoi(argv[i]);
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
sparams.repeat_last_n = std::stoi(argv[i]);
} else if (arg == "--repeat-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.penalty_repeat = std::stof(argv[i]);
sparams.repeat_penalty = std::stof(argv[i]);
} else if (arg == "--frequency-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.penalty_freq = std::stof(argv[i]);
sparams.frequency_penalty = std::stof(argv[i]);
} else if (arg == "--presence-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.penalty_present = std::stof(argv[i]);
sparams.presence_penalty = std::stof(argv[i]);
} else if (arg == "--mirostat") {
if (++i >= argc) {
invalid_param = true;
@@ -416,18 +341,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_sequences = std::stoi(argv[i]);
} else if (arg == "--p-accept" || arg == "-pa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.p_accept = std::stof(argv[i]);
} else if (arg == "--p-split" || arg == "-ps") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.p_split = std::stof(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
@@ -633,8 +546,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
} else if (arg == "-h" || arg == "--help") {
return false;
gpt_print_usage(argc, argv, default_params);
#ifndef LOG_DISABLE_LOGS
log_print_usage();
#endif // LOG_DISABLE_LOGS
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else if (arg == "--in-prefix-bos") {
@@ -656,7 +572,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
sparams.grammar = argv[i];
params.grammar = argv[i];
} else if (arg == "--grammar-file") {
if (++i >= argc) {
invalid_param = true;
@@ -671,7 +587,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(sparams.grammar)
std::back_inserter(params.grammar)
);
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
@@ -693,24 +609,28 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS
} else {
throw std::invalid_argument("error: unknown argument: " + arg);
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
process_escapes(sparams.cfg_negative_prompt);
for (auto & antiprompt : params.antiprompt) {
process_escapes(antiprompt);
}
@@ -720,9 +640,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sparams;
const llama_sampling_params & sparams = params.sampling_params;
printf("\n");
printf("usage: %s [options]\n", argv[0]);
printf("\n");
printf("options:\n");
@@ -757,13 +676,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
printf(" --mirostat N use Mirostat sampling.\n");
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@@ -780,16 +698,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --cfg-negative-prompt-file FNAME\n");
printf(" negative prompt file to use for guidance. (default: empty)\n");
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -803,8 +714,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
@@ -832,7 +741,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif // GGML_USE_CUBLAS
#endif
printf(" --verbose-prompt print prompt before generation\n");
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -843,9 +752,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf("\n");
#ifndef LOG_DISABLE_LOGS
log_print_usage();
#endif // LOG_DISABLE_LOGS
}
std::string get_system_info(const gpt_params & params) {
@@ -899,48 +805,21 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();
cparams.n_ctx = params.n_ctx;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.mul_mat_q = params.mul_mat_q;
cparams.seed = params.seed;
cparams.f16_kv = params.memory_f16;
cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale;
cparams.yarn_ext_factor = params.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.n_ctx = params.n_ctx;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.mul_mat_q = params.mul_mat_q;
cparams.seed = params.seed;
cparams.f16_kv = params.memory_f16;
cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding;
cparams.rope_freq_base = params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale;
return cparams;
}
void llama_batch_clear(struct llama_batch & batch) {
batch.n_tokens = 0;
}
void llama_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
const std::vector<llama_seq_id> & seq_ids,
bool logits) {
batch.token [batch.n_tokens] = id;
batch.pos [batch.n_tokens] = pos,
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
for (size_t i = 0; i < seq_ids.size(); ++i) {
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
}
batch.logits [batch.n_tokens] = logits;
batch.n_tokens++;
}
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);
@@ -978,15 +857,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
if (params.ignore_eos) {
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
}
{
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx);
llama_kv_cache_tokens_rm(lctx, -1, -1);
llama_reset_timings(lctx);
}
@@ -1000,23 +879,21 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos,
bool special) {
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
bool add_bos) {
return llama_tokenize(llama_get_model(ctx), text, add_bos);
}
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos,
bool special) {
bool add_bos) {
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@@ -1039,7 +916,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
}
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
const llama_token bos_id = llama_token_bos(ctx);
std::string piece;
std::string result;
@@ -1223,28 +1100,28 @@ std::string get_sortable_timestamp() {
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const llama_sampling_params & sparams = params.sparams;
const llama_sampling_params & sparams = params.sampling_params;
fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
#ifdef NDEBUG
fprintf(stream, "debug: false\n");
@@ -1278,13 +1155,13 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
@@ -1338,14 +1215,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
fprintf(stream, "reverse_prompt:\n");
for (std::string ap : params.antiprompt) {
@@ -1372,7 +1249,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
}

View File

@@ -9,7 +9,6 @@
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <cmath>
#include <string>
#include <vector>
#include <random>
@@ -26,54 +25,38 @@
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
#define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
} while(0)
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_accept = 0.5f; // speculative decoding accept probability
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
// pinging @cebtenzzre
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
// // sampling parameters
struct llama_sampling_params sparams;
struct llama_sampling_params sampling_params;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
@@ -83,10 +66,10 @@ struct gpt_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
@@ -94,7 +77,7 @@ struct gpt_params {
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
@@ -127,8 +110,6 @@ struct gpt_params {
std::string image = ""; // path to an image file
};
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
@@ -143,23 +124,10 @@ void process_escapes(std::string& input);
// Model utils
//
// TODO: avoid tuplue, use struct
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
void llama_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,
const std::vector<llama_seq_id> & seq_ids,
bool logits);
//
// Vocab utils
//
@@ -169,14 +137,12 @@ void llama_batch_add(
std::vector<llama_token> llama_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_bos,
bool special = false);
bool add_bos);
std::vector<llama_token> llama_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_bos,
bool special = false);
bool add_bos);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`

View File

@@ -399,7 +399,7 @@ namespace grammar_parser {
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (const auto & kv : state.symbol_ids) {
for (auto kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {

View File

@@ -97,56 +97,37 @@
#define LOG_TEE_TARGET stderr
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
return pid;
}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
// where the number is a runtime id of the current thread.
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
// INTERNAL, DO NOT USE
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
{
static bool _multilog = false;
if (multilog != LogTriStateSame)
{
_multilog = multilog == LogTriStateTrue;
}
std::stringstream buf;
buf << log_file_basename;
if (_multilog)
{
buf << ".";
buf << log_get_pid();
}
buf << ".";
buf << log_get_pid();
buf << ".";
buf << log_file_extension;
@@ -231,6 +212,15 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL ,""
#endif
// Utility for synchronizing log configuration state
// since std::optional was introduced only in c++17
enum LogTriState
{
LogTriStateSame,
LogTriStateFalse,
LogTriStateTrue
};
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
@@ -324,23 +314,16 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#endif
// INTERNAL, DO NOT USE
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
{
static bool _initialized = false;
static bool _append = false;
static bool _disabled = filename.empty() && target == nullptr;
static bool _initialized{false};
static bool _disabled{(filename.empty() && target == nullptr)};
static std::string log_current_filename{filename};
static FILE *log_current_target{target};
static FILE *logfile = nullptr;
if (change)
{
if (append != LogTriStateSame)
{
_append = append == LogTriStateTrue;
return logfile;
}
if (disable == LogTriStateTrue)
{
// Disable primary target
@@ -393,7 +376,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
}
}
logfile = fopen(filename.c_str(), _append ? "a" : "w");
logfile = fopen(filename.c_str(), "w");
}
if (!logfile)
@@ -414,9 +397,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
}
// INTERNAL, DO NOT USE
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
{
return log_handler1_impl(change, append, disable, filename, target);
return log_handler1_impl(change, disable, filename, target);
}
// Disables logs entirely at runtime.
@@ -427,7 +410,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
// INTERNAL, DO NOT USE
inline FILE *log_disable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
return log_handler1_impl(true, LogTriStateTrue);
}
// Enables logs at runtime.
@@ -436,31 +419,19 @@ inline FILE *log_disable_impl()
// INTERNAL, DO NOT USE
inline FILE *log_enable_impl()
{
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
return log_handler1_impl(true, LogTriStateFalse);
}
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
#define log_set_target(target) log_set_target_impl(target)
// INTERNAL, DO NOT USE
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
// INTERNAL, DO NOT USE
inline FILE *log_handler() { return log_handler1_impl(); }
// Enable or disable creating separate log files for each run.
// can ONLY be invoked BEFORE first log use.
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
// Enable or disable append mode for log file.
// can ONLY be invoked BEFORE first log use.
#define log_append(enable) log_append_impl(enable)
// INTERNAL, DO NOT USE
inline FILE *log_append_impl(bool enable)
{
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
}
inline void log_test()
{
log_disable();
@@ -522,18 +493,6 @@ inline bool log_param_single_parse(const std::string & param)
return true;
}
if (param == "--log-new")
{
log_multilog(true);
return true;
}
if (param == "--log-append")
{
log_append(true);
return true;
}
return false;
}
@@ -563,9 +522,7 @@ inline void log_print_usage()
printf(" --log-disable Disable trace logs\n");
printf(" --log-enable Enable trace logs\n");
printf(" --log-file Specify a log filename (without extension)\n");
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
@@ -622,75 +579,38 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
return buf.str();
}
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (const auto &token : tokens)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, token);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "'" << detokenized << "'"
<< ":" << std::to_string(token);
}
buf << " ]";
return buf.str();
}
template <typename C, typename B>
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
{
std::stringstream buf;
buf << "[ ";
bool first = true;
for (int i = 0; i < batch.n_tokens; ++i)
{
if (!first) {
buf << ", ";
} else {
first = false;
}
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
detokenized.erase(
std::remove_if(
detokenized.begin(),
detokenized.end(),
[](const unsigned char c) { return !std::isprint(c); }),
detokenized.end());
buf
<< "\n" << std::to_string(i)
<< ":token '" << detokenized << "'"
<< ":pos " << std::to_string(batch.pos[i])
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
<< ":logits " << std::to_string(batch.logits[i]);
}
buf << " ]";
return buf.str();
}
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
[&tokens, &ctx]() \
{ \
std::stringstream buf; \
buf << "[ "; \
\
bool first = true; \
for (const auto &token : tokens) \
{ \
if (!first) \
buf << ", "; \
else \
first = false; \
\
auto detokenized = llama_token_to_piece(ctx, token); \
\
detokenized.erase( \
std::remove_if( \
detokenized.begin(), \
detokenized.end(), \
[](const unsigned char c) { return !std::isprint(c); }), \
detokenized.end()); \
\
buf \
<< "'" << detokenized << "'" \
<< ":" << std::to_string(token); \
} \
buf << " ]"; \
\
return buf.str(); \
}() \
.c_str()
#ifdef LOG_DISABLE_LOGS

View File

@@ -1,163 +1,113 @@
#include "sampling.h"
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
result->params = params;
result->grammar = nullptr;
// if there is a grammar, parse it
if (!params.grammar.empty()) {
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (result->parsed_grammar.rules.empty()) {
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
return nullptr;
llama_sampling_context::~llama_sampling_context() {
for (auto & it : sequence_contexts) {
if (it.second.grammar != NULL) {
llama_grammar_free(it.second.grammar);
it.second.grammar = NULL;
}
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
result->grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
}
result->prev.resize(params.n_prev);
return result;
}
void llama_sampling_free(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
}
llama_sampling_context llama_sampling_context_init(
const struct gpt_params & params,
llama_grammar * grammar) {
llama_sampling_context result;
delete ctx;
result.params = params.sampling_params;
result.grammar = grammar;
return result;
}
void llama_sampling_reset(llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
// Note: Creates the context if it doesn't exist, so this always return something.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq) {
const auto it = ctx_sampling.sequence_contexts.find(seq);
if (it != ctx_sampling.sequence_contexts.end()) {
return it->second;
}
if (!ctx->parsed_grammar.rules.empty()) {
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
ctx->grammar = llama_grammar_init(
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
}
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
llama_sampler_sequence_context new_ctx = {
2.0f * ctx_sampling.params.mirostat_tau,
ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
};
return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
}
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
if (dst->grammar) {
llama_grammar_free(dst->grammar);
dst->grammar = nullptr;
bool llama_sampling_context_reset(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq) {
const auto it = ctx_sampling.sequence_contexts.find(seq);
if (it == ctx_sampling.sequence_contexts.end()) return false;
if (it->second.grammar != NULL) {
llama_grammar_free(it->second.grammar);
it->second.grammar = NULL;
}
if (src->grammar) {
dst->grammar = llama_grammar_copy(src->grammar);
}
dst->prev = src->prev;
}
llama_token llama_sampling_last(llama_sampling_context * ctx) {
return ctx->prev.back();
}
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
const int size = ctx_sampling->prev.size();
n = std::min(n, size);
std::string result;
for (int i = size - n; i < size; i++) {
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
}
return result;
}
std::string llama_sampling_print(const llama_sampling_params & params) {
char result[1024];
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
return std::string(result);
ctx_sampling.sequence_contexts.erase(it);
return true;
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_sampling_context & ctx_sampling,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
const int idx,
llama_seq_id seq) {
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_sampling_params & params = ctx_sampling.params;
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float min_p = params.min_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
llama_token id = 0;
float * logits = llama_get_logits_ith(ctx_main, idx);
float * logits = llama_get_logits_ith(ctx, idx);
// apply params.logit_bias map
// Apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
cur.clear();
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (ctx_cfg) {
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
}
// apply penalties
if (!prev.empty()) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
if (!last_tokens.empty()) {
const float nl_logit = logits[llama_token_nl(ctx)];
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalties(ctx_main, &cur_p,
prev.data() + prev.size() - penalty_last_n,
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
llama_sample_repetition_penalty(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
last_tokens.data() + last_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
cur_p.data[idx].logit = nl_logit;
break;
}
@@ -165,65 +115,52 @@ llama_token llama_sampling_sample(
}
}
if (ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
if (ctx_seq.grammar != NULL) {
llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
}
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &cur_p);
} else {
if (mirostat == 1) {
const int mirostat_m = 100;
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
llama_sample_temp(ctx, &cur_p, temp);
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
} else if (mirostat == 2) {
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
llama_sample_temp(ctx, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
} else {
// temperature sampling
// Temperature sampling
size_t min_keep = std::max(1, params.n_probs);
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
llama_sample_temp(ctx, &cur_p, temp);
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep);
llama_sample_temp (ctx_main, &cur_p, temp);
{
const int n_top = 10;
LOG("top %d candidates:\n", n_top);
id = llama_sample_token(ctx_main, &cur_p);
for (int i = 0; i < n_top; i++) {
const llama_token id = cur_p.data[i].id;
(void)id; // To avoid a warning that id is unused when logging is disabled.
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
}
}
//{
// const int n_top = 10;
// LOG("top %d candidates:\n", n_top);
id = llama_sample_token(ctx, &cur_p);
// for (int i = 0; i < n_top; i++) {
// const llama_token id = cur_p.data[i].id;
// (void)id; // To avoid a warning that id is unused when logging is disabled.
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
// }
//}
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
}
}
if (ctx_seq.grammar != NULL) {
llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
}
return id;
}
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar) {
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
}
}

View File

@@ -2,109 +2,107 @@
#include "llama.h"
#include "grammar-parser.h"
#include <string>
#include <vector>
#include <unordered_map>
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = true; // consider newlines as a repeatable token
std::string grammar; // optional BNF-like grammar to constrain sampling
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // how strong is guidance
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
} llama_sampling_params;
// per-sequence sampler context
typedef struct llama_sampler_sequence_context {
float mirostat_mu; // mirostat sampler state
llama_grammar * grammar;
} llama_sampler_sequence_context;
// general sampler context
// TODO: move to llama.h
struct llama_sampling_context {
// parameters that will be used for sampling
typedef struct llama_sampling_context {
~llama_sampling_context();
// parameters that will be used for sampling and when creating
// new llama_sampler_sequence_context instances
llama_sampling_params params;
// mirostat sampler state
float mirostat_mu;
// map of sequence ids to sampler contexts
std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
// when non-NULL, new instances of llama_sampler_sequence_context
// will get a copy of the grammar here
// note: only the pointer is stored here, it is not a copy of
// the grammar and shouldn't be freed
llama_grammar * grammar;
// internal
grammar_parser::parse_state parsed_grammar;
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
};
} llama_sampling_context;
#include "common.h"
// Create a new sampling context instance.
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
llama_sampling_context llama_sampling_context_init(
const struct gpt_params & params,
llama_grammar * grammar = NULL);
void llama_sampling_free(struct llama_sampling_context * ctx);
// Fetches the sampler context for the specified sequence id (defaults to 0).
// If the context for that sequence id doesn't already exist, it will be created with
// default values based on the parameters in the ctx_sampling argument.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq = 0);
// Reset the sampler context
// - clear prev tokens
// - reset grammar
void llama_sampling_reset(llama_sampling_context * ctx);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);
// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);
// Reset the sampler context for the supplied sequence id (defaults to 0).
// This is necessary to reuse a sequence id or free memory used by sequences
// that are no longer required.
bool llama_sampling_context_reset(
llama_sampling_context & ctx_sampling,
const llama_seq_id seq = 0);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
// llama_sampling_reset when a sequence ends
// llama_sampling_context_reset when a sequence ends
//
// required:
// - ctx_main: context to use for sampling
// - ctx: context to use for sampling
// - ctx_sampling: sampling-specific context
//
// optional:
// - ctx_cfg: context to use for classifier-free guidance
// - idx: sample from llama_get_logits_ith(ctx, idx)
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits_ith(ctx, idx)
// - seq: sequence id to associate sampler state with
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_sampling_context & ctx_sampling,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
const int idx = 0,
llama_seq_id seq = 0);

View File

@@ -32,7 +32,6 @@ struct train_state * init_train_state() {
state->opt = new struct ggml_opt_context;
state->opt->ctx = NULL;
state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
state->opt->loss_after = 0.0f;
return state;
@@ -237,8 +236,8 @@ int64_t get_example_targets_batch(
int64_t used_samples = 0;
ggml_set_f32(target_probs, 0.0f);
llama_token bos = llama_token_bos(llama_get_model(lctx));
llama_token eos = llama_token_eos(llama_get_model(lctx));
llama_token bos = llama_token_bos(lctx);
llama_token eos = llama_token_eos(lctx);
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
for (int k=0; k<n_batch; ++k) {
// printf("%s: batch %d\n", __func__, k);
@@ -864,7 +863,7 @@ size_t tokenize_file(
(int) buf.size(),
out_tokens.data(),
(int) out_tokens.size(),
false, false);
false);
if (n_tokens < 0) {
out_tokens.resize(-n_tokens);
n_tokens = llama_tokenize(
@@ -873,7 +872,7 @@ size_t tokenize_file(
(int) buf.size(),
out_tokens.data(),
(int) out_tokens.size(),
false, false);
false);
}
if (n_tokens >= 0) {
out_tokens.resize(n_tokens);
@@ -925,7 +924,7 @@ size_t tokenize_file(
for (llama_token token=0; token < n_vocab; ++token) {
max_token_text_size = std::max(
max_token_text_size,
strlen(llama_token_get_text(llama_get_model(lctx), token)));
strlen(llama_token_get_text(lctx, token)));
}
// upper bound of context byte length.
@@ -967,7 +966,7 @@ size_t tokenize_file(
(int) buf_sample.size(),
tok_sample.data(),
(int) tok_sample.size(),
false, false);
false);
if (n_tokens < 0) {
tok_sample.resize(-n_tokens);
n_tokens = llama_tokenize(llama_get_model(lctx),
@@ -975,7 +974,7 @@ size_t tokenize_file(
(int) buf_sample.size(),
tok_sample.data(),
(int) tok_sample.size(),
false, false);
false);
GGML_ASSERT(n_tokens >= 0);
}
GGML_ASSERT(n_tokens <= (int) tok_sample.size());
@@ -1046,7 +1045,6 @@ struct train_params_common get_default_train_params_common() {
params.n_batch = 8;
params.n_gradient_accumulation = 1;
params.n_epochs = -1;
params.n_gpu_layers = 0;
params.custom_n_ctx = false;
@@ -1082,7 +1080,6 @@ struct train_params_common get_default_train_params_common() {
params.adam_beta2 = 0.999f;
params.adam_gclip = 1.0f;
params.adam_eps_f = 0.0f;
return params;
}
@@ -1428,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
if (impr_plot > 0) impr_plot = 0;
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
__func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
*sched, opt->loss_after);

View File

@@ -9,8 +9,6 @@
#include "ggml.h"
#include "llama.h"
#define LLAMA_TRAIN_MAX_NODES 16384
typedef std::string mt19937_state;
struct train_state {
@@ -46,7 +44,6 @@ struct train_params_common {
int n_batch;
int n_gradient_accumulation;
int n_epochs;
int n_gpu_layers;
bool custom_n_ctx;

View File

@@ -16,7 +16,7 @@ import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
@@ -76,7 +76,6 @@ def parse_args() -> argparse.Namespace:
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
return parser.parse_args()
args = parse_args()
@@ -87,11 +86,6 @@ if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -110,7 +104,7 @@ print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
if hparams["architectures"][0] != "BaichuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
@@ -119,7 +113,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectur
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
@@ -163,8 +157,7 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
@@ -231,7 +224,7 @@ gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

238
convert-bloom-hf-to-gguf.py Executable file
View File

@@ -0,0 +1,238 @@
#!/usr/bin/env python3
# HF bloom --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
# Supported Models:
# https://huggingface.co/bigscience/bloom-1b7
# https://huggingface.co/bigscience/bloom-3b
# https://huggingface.co/bigscience/bloom-7b1
# https://huggingface.co/Langboat/bloom-1b4-zh
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "BloomForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.BLOOM
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("Bloom")
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
gguf_writer.add_embedding_length(n_embed)
gguf_writer.add_feed_forward_length(4 * n_embed)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
# params for qkv transform
n_head_kv = hparams.get("n_head_kv", n_head)
head_dim = n_embed // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
has_lm_head = True
if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
has_lm_head = False
for original_name in model_part.keys():
data = model_part[original_name]
name = re.sub(r'transformer\.', '', original_name)
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
data = np.concatenate(
(qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
axis=0
)
print("re-format attention.linear_qkv.weight")
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
data = np.concatenate(
(qkv_bias[:, 0, :].reshape((n_embed,)),
qkv_bias[:, 1, :].reshape((n_embed,)),
qkv_bias[:, 2, :].reshape((n_embed,))),
axis=0
)
print("re-format attention.linear_qkv.bias")
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
if not has_lm_head and name == "word_embeddings.weight":
gguf_writer.add_tensor("output.weight", data)
print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

250
convert-falcon-hf-to-gguf.py Executable file
View File

@@ -0,0 +1,250 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion
from __future__ import annotations
import argparse
import contextlib
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def count_model_parts(dir_model: Path, prefix: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith(prefix):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model, "model-00")
if num_parts:
is_safetensors = True
from safetensors import safe_open
else:
is_safetensors = False
num_parts = count_model_parts(dir_model, "pytorch_model-")
ARCH=gguf.MODEL_ARCH.FALCON
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["num_attention_heads"])
if "num_kv_heads" in hparams:
gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i])
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["num_attention_heads"]
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
elif is_safetensors:
part_names = (
f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
if is_safetensors:
ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
else:
ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
with ctx as model_part:
for name in model_part.keys():
data = model_part.get_tensor(name) if is_safetensors else model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data = torch.cat((q,k,v)).reshape_as(data)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

212
convert-gptneox-hf-to-gguf.py Executable file
View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.GPTNEOX
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
gguf_writer.add_head_count(hparams["num_attention_heads"])
gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@@ -1,890 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import contextlib
import json
import os
import re
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
import numpy as np
import torch
if TYPE_CHECKING:
from torch import Tensor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
class Model:
def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
def set_vocab(self):
self._set_vocab_gpt2()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for part_name in self.part_names:
print(f"gguf: loading model part '{part_name}'")
ctx: ContextManager[Any]
if self.is_safetensors:
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
with ctx as model_part:
for name in model_part.keys():
data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
yield name, data
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams.get(
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
))
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
self.gguf_writer.add_context_length(n_ctx)
if (n_embd := self.hparams.get("hidden_size")) is not None:
self.gguf_writer.add_embedding_length(n_embd)
if (n_ff := self.hparams.get("intermediate_size")) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
if (n_head := self.hparams.get("num_attention_head")) is not None:
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
def write(self):
self.write_tensors()
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file()
self.gguf_writer.close()
def write_vocab(self):
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@staticmethod
def count_model_parts(dir_model: Path, prefix: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.endswith(prefix):
num_parts += 1
return num_parts
@staticmethod
def load_hparams(dir_model):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def from_model_architecture(model_architecture):
if model_architecture == "StableLMEpochForCausalLM":
return StableLMModel
if model_architecture == "GPTNeoXForCausalLM":
return GPTNeoXModel
if model_architecture == "BloomForCausalLM":
return BloomModel
if model_architecture == "MPTForCausalLM":
return MPTModel
if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return BaichuanModel
if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
return FalconModel
if model_architecture == "GPTBigCodeForCausalLM":
return StarCoderModel
if model_architecture == "GPTRefactForCausalLM":
return RefactModel
if model_architecture == "PersimmonForCausalLM":
return PersimmonModel
return Model
def _is_model_safetensors(self) -> bool:
return Model.count_model_parts(self.dir_model, ".safetensors") > 0
def _get_part_names(self):
if self.is_safetensors:
if self.num_parts == 1: # there's only one .safetensors file
return ("model.safetensors",)
return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
if self.num_parts == 1: # there's only one .bin file
return ("pytorch_model.bin",)
return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
def _get_model_architecture(self) -> gguf.MODEL_ARCH:
arch = self.hparams["architectures"][0]
if arch == "GPTNeoXForCausalLM":
return gguf.MODEL_ARCH.GPTNEOX
if arch == "BloomForCausalLM":
return gguf.MODEL_ARCH.BLOOM
if arch == "MPTForCausalLM":
return gguf.MODEL_ARCH.MPT
if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
return gguf.MODEL_ARCH.BAICHUAN
if arch == "FalconForCausalLM":
return gguf.MODEL_ARCH.FALCON
if arch == "GPTBigCodeForCausalLM":
return gguf.MODEL_ARCH.STARCODER
if arch == "GPTRefactForCausalLM":
return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON
raise NotImplementedError(f'Architecture "{arch}" not supported!')
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer # type: ignore[attr-defined]
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode('utf-8')
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model'
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
if not tokenizer_path.is_file():
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
sys.exit(1)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
for token_id in range(vocab_size):
piece = tokenizer.id_to_piece(token_id)
text = piece.encode("utf-8")
score = tokenizer.get_score(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.is_unknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.is_control(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.is_unused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.is_byte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f)
for key in added_tokens_json:
tokens.append(key.encode("utf-8"))
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
class StableLMModel(Model):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_rope_dimension_count(
int(self.hparams["rope_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
)
self.gguf_writer.add_layer_norm_eps(1e-5)
class GPTNeoXModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(
int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
class BloomModel(Model):
def set_gguf_parameters(self):
self.gguf_writer.add_name("Bloom")
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
self.gguf_writer.add_embedding_length(n_embed)
self.gguf_writer.add_feed_forward_length(4 * n_embed)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head)
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
block_count = self.hparams["n_layer"]
tensors = dict(self.get_tensors())
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
has_lm_head = True
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
for name, data_torch in tensors.items():
if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
has_lm_head = False
name = re.sub(r'transformer\.', '', name)
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
data = np.concatenate(
(
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
),
axis=0,
)
print("re-format attention.linear_qkv.weight")
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
data = np.concatenate(
(
qkv_bias[:, 0, :].reshape((n_embed,)),
qkv_bias[:, 1, :].reshape((n_embed,)),
qkv_bias[:, 2, :].reshape((n_embed,)),
),
axis=0,
)
print("re-format attention.linear_qkv.bias")
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
if not has_lm_head and name == "word_embeddings.weight":
self.gguf_writer.add_tensor("output.weight", data)
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
class MPTModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
self.gguf_writer.add_head_count(self.hparams["n_heads"])
if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
self.gguf_writer.add_head_count_kv(kv_n_heads)
self.gguf_writer.add_layer_norm_eps(1e-5)
if self.hparams["attn_config"]["clip_qkv"] is not None:
self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
# note: MPT output is tied to (same as) wte in original model;
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
if new_name == "token_embd.weight":
self.gguf_writer.add_tensor("output.weight", data)
class BaichuanModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
head_count = self.hparams["num_attention_heads"]
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
hf_repo = self.hparams.get("_name_or_path", "")
ctx_length = 0
if "max_sequence_length" in self.hparams:
ctx_length = self.hparams["max_sequence_length"]
elif "max_position_embeddings" in self.hparams:
ctx_length = self.hparams["max_position_embeddings"]
elif "model_max_length" in self.hparams:
ctx_length = self.hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
def write_tensors(self):
# Collect tensors from generator object
model_kv = dict(self.get_tensors())
block_count = self.hparams["num_hidden_layers"]
head_count = self.hparams["num_attention_heads"]
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
for i in range(block_count):
if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
print(f"Unpacking and permuting layer {i}")
model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
self._reverse_hf_permute_part(w, 0, head_count, head_count)
model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
self._reverse_hf_part(w, 2)
del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
for name, data_torch in model_kv.items():
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape)
)
def _reverse_hf_permute_part(
self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
) -> Tensor:
r = weights.shape[0] // 3
return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
r = weights.shape[0] // 3
return weights[r * n_part:r * n_part + r, ...]
class FalconModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams.get("num_hidden_layers")
if block_count is None:
block_count = self.hparams["n_layer"] # old name
n_head = self.hparams.get("num_attention_heads")
if n_head is None:
n_head = self.hparams["n_head"] # old name
n_head_kv = self.hparams.get("num_kv_heads")
if n_head_kv is None:
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
self.gguf_writer.add_name("Falcon")
self.gguf_writer.add_context_length(2048) # not in config.json
self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head_kv)
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
block_count = self.hparams.get("num_hidden_layers")
if block_count is None:
block_count = self.hparams["n_layer"] # old name
n_head = self.hparams.get("num_attention_heads")
if n_head is None:
n_head = self.hparams["n_head"] # old name
n_head_kv = self.hparams.get("num_kv_heads")
if n_head_kv is None:
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
head_dim = self.hparams["hidden_size"] // n_head
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here,, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
class StarCoderModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]
self.gguf_writer.add_name("StarCoder")
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_head_count_kv(1)
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
class RefactModel(Model):
def set_gguf_parameters(self):
hidden_dim = self.hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
block_count = self.hparams["n_layer"]
self.gguf_writer.add_name("Refact")
# refact uses Alibi. So this is from config.json which might be used by training.
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(ff_dim)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_head_count_kv(1)
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
hidden_dim = self.hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
n_head = self.hparams["n_head"]
n_head_kv = 1
head_dim = self.hparams["n_embd"] // n_head
block_count = self.hparams["n_layer"]
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
tensors = dict(self.get_tensors())
for i in range(block_count):
if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
del tensors[f"transformer.h.{i}.attn.kv.weight"]
if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
del tensors[f"transformer.h.{i}.attn.q.weight"]
if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
for name, data_torch in tensors.items():
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
class PersimmonModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
head_count = self.hparams["num_attention_heads"]
head_count_kv = head_count
hidden_size = self.hparams["hidden_size"]
self.gguf_writer.add_name('persimmon-8b-chat')
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
def set_vocab(self):
self._set_vocab_sentencepiece()
# self.gguf_writer.add_bos_token_id(71013)
# self.gguf_writer.add_eos_token_id(71013)
def write_tensors(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
for name, data_torch in self.get_tensors():
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
n_dims = len(data.shape)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
###### CONVERSION LOGIC ######
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16"], default="f16",
help="output format - use f32 for float32, f16 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
parser.add_argument(
"model", type=Path,
help="directory containing model file",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file=sys.stderr)
sys.exit(1)
ftype_map = {
"f32": gguf.GGMLQuantizationType.F32,
"f16": gguf.GGMLQuantizationType.F16,
}
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model)
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model tokenizer")
model_instance.set_vocab()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
print(f"Model successfully exported to '{fname_out}'")

View File

@@ -12,9 +12,29 @@ import numpy as np
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
gguf.GGMLQuantizationType.F32 : (1, 4),
gguf.GGMLQuantizationType.F16 : (1, 2),
gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}
class GGMLFormat(IntEnum):
GGML = 0
GGMF = 1
@@ -105,7 +125,7 @@ class Tensor:
(n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
assert name_len < 4096, 'Absurd tensor name length'
quant = gguf.GGML_QUANT_SIZES.get(dtype)
quant = GGML_QUANT_SIZES.get(dtype)
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
@@ -368,9 +388,7 @@ def handle_metadata(cfg, hp):
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype )
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)

216
convert-mpt-hf-to-gguf.py Executable file
View File

@@ -0,0 +1,216 @@
#!/usr/bin/env python3
# HF mpt--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model", type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "MPTForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.MPT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layers"]
gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_seq_len"])
gguf_writer.add_embedding_length(hparams["d_model"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
gguf_writer.add_head_count(hparams["n_heads"])
gguf_writer.add_layer_norm_eps(1e-05)
if hparams["attn_config"]["clip_qkv"] is not None:
gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
# accomodate some "reserved" tokens; this is causing problems down the line in
# llama.cpp, so we pad the vocab with dummy tokens:
vocab_size = hparams["vocab_size"]
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Cannot map tensor '" + name + "'")
continue # for the sake of compatibility with some old published models, don't quit
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
# note: MPT output is tied to (same as) wte in original model;
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
if new_name == "token_embd.weight":
gguf_writer.add_tensor("output.weight", data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@@ -6,7 +6,7 @@ import argparse
from pathlib import Path
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def _flatten_dict(dct, tensors, prefix=None):

263
convert-refact-hf-to-gguf.py Executable file
View File

@@ -0,0 +1,263 @@
#!/usr/bin/env python3
# HF refact--> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if "NO_LOCAL_GGUF" not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a Refact model to a GGML compatible file"
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile",
type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"model",
type=Path,
help="directory containing model file, or model file itself (*.bin)",
)
parser.add_argument(
"ftype",
type=int,
choices=[0, 1],
default=1,
nargs="?",
help="output format - use 0 for float32, 1 for float16",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f"Error: {args.model} is not a directory", file=sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
print("gguf: loading model " + dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTRefactForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH = gguf.MODEL_ARCH.REFACT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
# Get refact feed forward dimension
hidden_dim = hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
block_count = hparams["n_layer"]
gguf_writer.add_name("Refact")
# refact uses Alibi. So this is from config.json which might be used by training.
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(ff_dim)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for i in range(block_count):
if f"transformer.h.{i}.attn.kv.weight" in model_part:
data = model_part[f"transformer.h.{i}.attn.kv.weight"]
model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
: n_head_kv * head_dim
]
model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
n_head_kv * head_dim :
]
del model_part[f"transformer.h.{i}.attn.kv.weight"]
if f"transformer.h.{i}.attn.q.weight" in model_part:
model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
f"transformer.h.{i}.attn.q.weight"
]
del model_part[f"transformer.h.{i}.attn.q.weight"]
if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if (
ftype == 1
and data_dtype == np.float32
and name.endswith(".weight")
and n_dims == 2
):
data = data.astype(np.float16)
print(
new_name
+ ", n_dims = "
+ str(n_dims)
+ ", "
+ str(old_dtype)
+ " --> "
+ str(data.dtype)
)
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

202
convert-starcoder-hf-to-gguf.py Executable file
View File

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
# HF starcoder --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any
import numpy as np
import torch
from transformers import AutoTokenizer # type: ignore[import]
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
def count_model_parts(dir_model: Path) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
# get number of model parts
num_parts = count_model_parts(dir_model)
ARCH=gguf.MODEL_ARCH.STARCODER
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["n_layer"]
gguf_writer.add_name("StarCoder")
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []
# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")
print("gguf: get gpt2 tokenizer vocab")
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["n_embd"] // n_head
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(dir_model / part_name, map_location="cpu")
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View File

@@ -3,9 +3,11 @@ from __future__ import annotations
import argparse
import concurrent.futures
import copy
import enum
import faulthandler
import functools
import io
import itertools
import json
import math
@@ -21,14 +23,14 @@ from abc import ABCMeta, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
import numpy as np
from sentencepiece import SentencePieceProcessor
from sentencepiece import SentencePieceProcessor # type: ignore[import]
import os
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
if TYPE_CHECKING:
@@ -149,11 +151,8 @@ class Params:
n_head_kv: int
f_norm_eps: float
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
n_orig_ctx: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None
@@ -199,20 +198,20 @@ class Params:
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
n_vocab = config["vocab_size"]
n_embd = config["hidden_size"]
n_layer = config["num_hidden_layers"]
n_ff = config["intermediate_size"]
n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
f_norm_eps = config["rms_norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
if rope_scaling is not None and (typ := rope_scaling.get("type")):
rope_factor = rope_scaling.get("factor")
f_rope_scale = rope_factor
if typ == "linear":
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
rope_scaling = config.get("rope_scaling")
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
f_rope_scale = config["rope_scaling"].get("factor")
else:
f_rope_scale = None
if "max_sequence_length" in config:
n_ctx = config["max_sequence_length"]
@@ -223,19 +222,16 @@ class Params:
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
return Params(
n_vocab = config["vocab_size"],
n_embd = config["hidden_size"],
n_layer = config["num_hidden_layers"],
n_ctx = n_ctx,
n_ff = config["intermediate_size"],
n_head = (n_head := config["num_attention_heads"]),
n_head_kv = config.get("num_key_value_heads", n_head),
f_norm_eps = config["rms_norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
rope_finetuned = rope_finetuned,
n_vocab = n_vocab,
n_embd = n_embd,
n_layer = n_layer,
n_ctx = n_ctx,
n_ff = n_ff,
n_head = n_head,
n_head_kv = n_head_kv,
f_norm_eps = f_norm_eps,
f_rope_freq_base = f_rope_freq_base,
f_rope_scale = f_rope_scale,
)
# LLaMA v2 70B params.json
@@ -244,8 +240,17 @@ class Params:
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
config = json.load(open(config_path))
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
n_embd = config["dim"]
n_layer = config["n_layers"]
n_ff = -1
n_head = config["n_heads"]
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
f_norm_eps = config["norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
# hack to determine LLaMA v1 vs v2 vs CodeLlama
if config.get("rope_theta") == 1000000:
if f_rope_freq_base == 1000000:
# CodeLlama
n_ctx = 16384
elif config["norm_eps"] == 1e-05:
@@ -255,16 +260,22 @@ class Params:
# LLaMA v1
n_ctx = 2048
if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0]
if n_ff == -1:
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
return Params(
n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
n_embd = config["dim"],
n_layer = config["n_layers"],
n_vocab = n_vocab,
n_embd = n_embd,
n_layer = n_layer,
n_ctx = n_ctx,
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
n_head = (n_head := config["n_heads"]),
n_head_kv = config.get("n_kv_heads", n_head),
f_norm_eps = config["norm_eps"],
f_rope_freq_base = config.get("rope_theta"),
n_ff = n_ff,
n_head = n_head,
n_head_kv = n_head_kv,
f_norm_eps = f_norm_eps,
f_rope_freq_base = f_rope_freq_base,
)
@staticmethod
@@ -326,7 +337,7 @@ class BpeVocab:
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.bpe_tokenizer
from transformers.models.gpt2 import tokenization_gpt2
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
for i, _ in enumerate(tokenizer):
@@ -355,19 +366,16 @@ class SentencePieceVocab:
added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -795,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def __init__(self, fname_out: Path) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
@@ -820,16 +828,8 @@ class OutputFile:
if params.f_rope_freq_base is not None:
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.rope_scaling_type:
assert params.f_rope_scale is not None
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
if params.n_orig_ctx is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
if params.f_rope_scale is not None:
self.gguf.add_rope_scale_linear(params.f_rope_scale)
if params.ftype is not None:
self.gguf.add_file_type(params.ftype)
@@ -849,7 +849,7 @@ class OutputFile:
elif isinstance(vocab, BpeVocab):
self.gguf.add_tokenizer_model("gpt2")
else:
raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
self.gguf.add_token_list(tokens)
self.gguf.add_token_scores(scores)
self.gguf.add_token_types(toktypes)
@@ -875,10 +875,10 @@ class OutputFile:
self.gguf.close()
@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out, endianess=endianess)
of = OutputFile(fname_out)
# meta data
of.add_meta_arch(params)
@@ -903,10 +903,10 @@ class OutputFile:
return dt.quantize(arr)
@staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out, endianess=endianess)
of = OutputFile(fname_out)
# meta data
of.add_meta_arch(params)
@@ -1112,24 +1112,19 @@ def do_dump_model(model_plus: ModelPlus) -> None:
def main(args_in: list[str] | None = None) -> None:
output_choices = ["f32", "f16"]
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
args = parser.parse_args(args_in)
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
@@ -1143,9 +1138,6 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
params = Params.load(model_plus)
if params.n_ctx == -1:
@@ -1167,13 +1159,10 @@ def main(args_in: list[str] | None = None) -> None:
vocab: Vocab
if args.vocab_only:
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
assert args.outfile, "need --outfile if using --vocab-only"
# FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
print(f"Wrote {outfile}")
@@ -1185,9 +1174,7 @@ def main(args_in: list[str] | None = None) -> None:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir, args.vocabtype)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
model = model_plus.model
model = convert_model_names(model, params)
@@ -1198,7 +1185,7 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
print(f"Wrote {outfile}")

View File

@@ -17,7 +17,7 @@ llama_model_load_internal: [cublas] total VRAM used: 17223 MB
If you see these lines, then the GPU is being used.
## Verifying that the CPU is not oversaturated
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
# Example of runtime flags effect on inference speed benchmark
These runs were tested on the following machine:

View File

@@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(finetune)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(finetune)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(speculative)
add_subdirectory(parallel)
add_subdirectory(embd-input)
add_subdirectory(llava)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()

View File

@@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
return 1;
}
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
llama_batch batch = llama_batch_init(n_kv_max, 0);
// decode in batches of ctx_params.n_batch tokens
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
@@ -123,12 +123,11 @@ int main(int argc, char ** argv) {
llama_batch batch_view = {
n_tokens,
batch.token + i,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
batch.pos + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
@@ -144,8 +143,13 @@ int main(int argc, char ** argv) {
// warm up
{
for (int i = 0; i < 16; ++i) {
llama_batch_add(batch, 0, i, { 0 }, false);
batch.n_tokens = 16;
for (int i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = 0;
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@@ -154,10 +158,6 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
@@ -174,18 +174,19 @@ int main(int argc, char ** argv) {
continue;
}
llama_batch_clear(batch);
batch.n_tokens = is_pp_shared ? pp : pl*pp;
const int n_tokens = is_pp_shared ? pp : pl*pp;
for (int i = 0; i < n_tokens; ++i) {
llama_batch_add(batch, 0, i, { 0 }, false);
for (int i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = 0;
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
batch.logits[batch.n_tokens - 1] = true;
const auto t_pp_start = ggml_time_us();
llama_kv_cache_clear(ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
@@ -203,10 +204,13 @@ int main(int argc, char ** argv) {
const auto t_tg_start = ggml_time_us();
for (int i = 0; i < tg; ++i) {
llama_batch_clear(batch);
batch.n_tokens = pl;
for (int j = 0; j < pl; ++j) {
llama_batch_add(batch, 0, pp + i, { j }, true);
batch.token[j] = 0;
batch.pos[j] = pp + i;
batch.seq_id[j] = j;
batch.logits[j] = true;
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {

View File

@@ -69,7 +69,7 @@ for id: llama_token in tokens {
print("\n")
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
defer {
llama_batch_free(batch)
}
@@ -80,12 +80,7 @@ batch.n_tokens = Int32(tokens.count)
for (i, token) in tokens.enumerated() {
batch.token[i] = token
batch.pos[i] = Int32(i)
batch.n_seq_id[i] = 1
// batch.seq_id[i][0] = 0
// TODO: is this the proper way to do this?
if let seq_id = batch.seq_id[i] {
seq_id[0] = 0
}
batch.seq_id[i] = 0
batch.logits[i] = 0
}
@@ -174,10 +169,7 @@ while n_cur <= n_len {
// push this new token for next evaluation
batch.token[Int(batch.n_tokens)] = new_token_id
batch.pos[Int(batch.n_tokens)] = n_cur
batch.n_seq_id[Int(batch.n_tokens)] = 1
if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
seq_id[0] = Int32(i)
}
batch.seq_id[Int(batch.n_tokens)] = Int32(i)
batch.logits[Int(batch.n_tokens)] = 1
i_batch[i] = batch.n_tokens
@@ -217,7 +209,7 @@ llama_print_timings(context)
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
let n_tokens = text.count + (add_bos ? 1 : 0)
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
var swiftTokens: [llama_token] = []
for i in 0 ..< tokenCount {
swiftTokens.append(tokens[Int(i)])

View File

@@ -11,19 +11,12 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
return 1 ;
}
// number of parallel batches
int n_parallel = 1;
// total length of the sequences including the prompt
int n_len = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) {
params.model = argv[1];
}
@@ -36,18 +29,13 @@ int main(int argc, char ** argv) {
n_parallel = std::atoi(argv[3]);
}
if (argc >= 5) {
n_len = std::atoi(argv[4]);
}
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
// total length of the sequences including the prompt
const int n_len = 32;
// init LLM
llama_backend_init(params.numa);
@@ -56,7 +44,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = n_gpu_layers;
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -109,15 +97,20 @@ int main(int argc, char ** argv) {
fflush(stderr);
// create a llama_batch
// create a llama_batch with size 512
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); ++i) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
batch.n_tokens = tokens_list.size();
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
@@ -153,7 +146,7 @@ int main(int argc, char ** argv) {
while (n_cur <= n_len) {
// prepare the next batch
llama_batch_clear(batch);
batch.n_tokens = 0;
// sample the next token for each parallel sequence / stream
for (int32_t i = 0; i < n_parallel; ++i) {
@@ -187,7 +180,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
@@ -205,10 +198,15 @@ int main(int argc, char ** argv) {
streams[i] += llama_token_to_piece(ctx, new_token_id);
// push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id;
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = i;
batch.logits[batch.n_tokens] = true;
i_batch[i] = batch.n_tokens;
// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { i }, true);
batch.n_tokens += 1;
n_decode += 1;
}

View File

@@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.

View File

@@ -1,6 +1,9 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "ggml.h"
@@ -171,8 +172,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, m11xm2);
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
printf("n_threads=%i\n", benchmark_params.n_threads);
@@ -181,9 +181,9 @@ int main(int argc, char ** argv) {
std::vector<uint8_t> work_buffer;
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
TENSOR_DUMP(gf->nodes[0]);
TENSOR_DUMP(gf.nodes[0]);
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
@@ -201,8 +201,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
// printf("Creating compute graph\n");
struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf31, q31);
struct ggml_cgraph gf31 = ggml_build_forward(q31);
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
@@ -213,8 +212,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
//printf("Creating compute graph\n");
struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
ggml_build_forward_expand(gf32, q32);
struct ggml_cgraph gf32 = ggml_build_forward(q32);
printf("n_threads=%i\n", benchmark_params.n_threads);
const int dimx = sizex;
@@ -226,7 +224,7 @@ int main(int argc, char ** argv) {
// Let's use the F32 result from above as a reference for the quantized multiplication
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
printf("=====================================================================================\n");
@@ -236,7 +234,7 @@ int main(int argc, char ** argv) {
long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
long long int stop = ggml_time_us();
long long int usec = stop-start;
@@ -254,7 +252,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
@@ -269,7 +267,7 @@ int main(int argc, char ** argv) {
}
// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));

View File

@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
if (file.size < 4) {
return false;
}
std::string magic = file.read_string(4);
uint32_t magic = file.read_u32();
return magic == GGUF_MAGIC;
}

4
examples/embd-input/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
PandaGPT
MiniGPT-4
*.pth

View File

@@ -0,0 +1,17 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -0,0 +1,63 @@
### Examples for input embedding directly
## Requirement
build `libembdinput.so`
run the following comman in main dir (../../).
```
make
```
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
```
import torch
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd-input/llava_projection.pth"
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is
```
{
"peft_type": "LORA",
"fan_in_fan_out": false,
"bias": null,
"modules_to_save": null,
"r": 32,
"lora_alpha": 32,
"lora_dropout": 0.1,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Papare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirement of PandaGPT.
6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirement of PandaGPT.
4. Papare the `vicuna` v0 model.
5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.

View File

@@ -0,0 +1,221 @@
#include "build-info.h"
#include "common.h"
#include "embd-input.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
static llama_context ** g_ctx;
extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return nullptr;
}
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL));
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return nullptr;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
struct MyModel * ret = new MyModel();
ret->ctx = ctx;
ret->params = params;
ret->n_past = 0;
// printf("ctx: %d\n", ret->ctx);
return ret;
}
void free_mymodel(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
llama_print_timings(ctx);
llama_free(ctx);
delete mymodel;
}
bool eval_float(void * model, float * input, int N){
MyModel * mymodel = (MyModel*)model;
llama_context * ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_emb = llama_n_embd(llama_get_model(ctx));
int n_past = mymodel->n_past;
int n_batch = N; // params.n_batch;
for (int i = 0; i < (int) N; i += n_batch) {
int n_eval = (int) N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
MyModel * mymodel = (MyModel* )model;
llama_context * ctx;
ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_past = mymodel->n_past;
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_id(struct MyModel* mymodel, int id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(mymodel, tokens);
}
bool eval_string(struct MyModel * mymodel,const char* str){
llama_context * ctx = mymodel->ctx;
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
eval_tokens(mymodel, embd_inp);
return true;
}
llama_token sampling_id(struct MyModel* mymodel) {
llama_context* ctx = mymodel->ctx;
gpt_params params = mymodel->params;
llama_sampling_params & sparams = params.sampling_params;
// int n_ctx = llama_n_ctx(ctx);
// out of user input, sample next token
const float temp = sparams.temp;
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
const float top_p = sparams.top_p;
const float tfs_z = sparams.tfs_z;
const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
// const float repeat_penalty = params.repeat_penalty;
// const float alpha_presence = params.presence_penalty;
// const float alpha_frequency = params.frequency_penalty;
const int mirostat = sparams.mirostat;
const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Apply params.logit_bias map
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
}
return id;
}
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}
}

View File

@@ -0,0 +1,35 @@
#include "embd-input.h"
#include <stdlib.h>
#include <random>
#include <string.h>
int main(int argc, char** argv) {
auto mymodel = create_mymodel(argc, argv);
int N = 10;
int max_tgt_len = 500;
int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
// add random float embd to test evaluation
float * data = new float[N*n_embd];
std::default_random_engine e;
std::uniform_real_distribution<float> u(0,1);
for (int i=0;i<N*n_embd;i++) {
data[i] = u(e);
}
eval_string(mymodel, "user: what is the color of the flag of UN?");
eval_float(mymodel, data, N);
eval_string(mymodel, "assistant:");
eval_string(mymodel, mymodel->params.prompt.c_str());
const char* tmp;
for (int i=0; i<max_tgt_len; i++) {
tmp = sampling(mymodel);
if (strcmp(tmp, "</s>")==0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
free_mymodel(mymodel);
return 0;
}

View File

@@ -0,0 +1,27 @@
#ifndef _EMBD_INPUT_H_
#define _EMBD_INPUT_H_ 1
#include "common.h"
#include "llama.h"
extern "C" {
typedef struct MyModel {
llama_context* ctx;
gpt_params params;
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);
}
#endif

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
import os
libc = cdll.LoadLibrary("./libembdinput.so")
libc.sampling.restype=c_char_p
libc.create_mymodel.restype=c_void_p
libc.eval_string.argtypes=[c_void_p, c_char_p]
libc.sampling.argtypes=[c_void_p]
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
class MyModel:
def __init__(self, args):
argc = len(args)
c_str = [c_char_p(i.encode()) for i in args]
args_c = (c_char_p * argc)(*c_str)
self.model = c_void_p(libc.create_mymodel(argc, args_c))
self.max_tgt_len = 512
self.print_string_eval = True
def __del__(self):
libc.free_mymodel(self.model)
def eval_float(self, x):
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
def eval_string(self, x):
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
if self.print_string_eval:
print(x)
def eval_token(self, x):
libc.eval_id(self.model, x)
def sampling(self):
s = libc.sampling(self.model)
return s
def stream_generate(self, end="</s>"):
ret = b""
end = end.encode()
for _ in range(self.max_tgt_len):
tmp = self.sampling()
ret += tmp
yield tmp
if ret.endswith(end):
break
def generate_with_print(self, end="</s>"):
ret = b""
for i in self.stream_generate(end=end):
ret += i
print(i.decode(errors="replace"), end="", flush=True)
print("")
return ret.decode(errors="replace")
def generate(self, end="</s>"):
text = b"".join(self.stream_generate(end=end))
return text.decode(errors="replace")
if __name__ == "__main__":
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
model.eval_string("""user: what is the color of the flag of UN?""")
x = np.random.random((5120,10))# , dtype=np.float32)
model.eval_float(x)
model.eval_string("""assistant:""")
for i in model.generate():
print(i.decode(errors="replace"), end="", flush=True)

71
examples/embd-input/llava.py Executable file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
vision_tower = "openai/clip-vit-large-patch14"
select_hidden_state_layer = -2
# (vision_config.image_size // vision_config.patch_size) ** 2
image_token_len = (224//14)**2
class Llava:
def __init__(self, args):
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
self.mm_projector = nn.Linear(1024, 5120)
self.model = MyModel(["main", *args])
def load_projection(self, path):
state = torch.load(path)
self.mm_projector.load_state_dict({
"weight": state["model.mm_projector.weight"],
"bias": state["model.mm_projector.bias"]})
def chat(self, question):
self.model.eval_string("user: ")
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
image_feature = select_hidden_state[:, 1:]
embd_image = self.mm_projector(image_feature)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("user: ")
self.model.eval_token(32003-2) # im_start
self.model.eval_float(embd_image.T)
for i in range(image_token_len-embd_image.shape[0]):
self.model.eval_token(32003-3) # im_patch
self.model.eval_token(32003-1) # im_end
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
if __name__=="__main__":
# model form liuhaotian/LLaVA-13b-delta-v1-1
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
# Also here can use pytorch_model-00003-of-00003.bin directly.
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"llava_projection.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
respose
a.chat("what is the color of it?")

129
examples/embd-input/minigpt4.py Executable file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image
minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
class MiniGPT4(Blip2Base):
"""
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
"""
def __init__(self,
args,
vit_model="eva_clip_g",
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
img_size=224,
drop_path_rate=0,
use_grad_checkpoint=False,
vit_precision="fp32",
freeze_vit=True,
freeze_qformer=True,
num_query_token=32,
llama_model="",
prompt_path="",
prompt_template="",
max_txt_len=32,
end_sym='\n',
low_resource=False, # use 8 bit and put vit in cpu
device_8bit=0
):
super().__init__()
self.img_size = img_size
self.low_resource = low_resource
self.preprocessor = Blip2ImageEvalProcessor(img_size)
print('Loading VIT')
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
)
print('Loading VIT Done')
print('Loading Q-Former')
self.Qformer, self.query_tokens = self.init_Qformer(
num_query_token, self.visual_encoder.num_features
)
self.Qformer.cls = None
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
for layer in self.Qformer.bert.encoder.layer:
layer.output = None
layer.intermediate = None
self.load_from_pretrained(url_or_filename=q_former_model)
print('Loading Q-Former Done')
self.llama_proj = nn.Linear(
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
)
self.max_txt_len = max_txt_len
self.end_sym = end_sym
self.model = MyModel(["main", *args])
# system prompt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions."
"###")
def encode_img(self, image):
image = self.preprocessor(image)
image = image.unsqueeze(0)
device = image.device
if self.low_resource:
self.vit_to_cpu()
image = image.to("cpu")
with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_output = self.Qformer.bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
inputs_llama = self.llama_proj(query_output.last_hidden_state)
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
return inputs_llama
def load_projection(self, path):
state = torch.load(path)["model"]
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def chat(self, question):
self.model.eval_string("Human: ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.encode_img(image)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("Human: <Img>")
self.model.eval_float(embd_image.T)
self.model.eval_string("</Img> ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
if __name__=="__main__":
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"pretrained_minigpt4.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
a.chat("what is the color of it?")

View File

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"
sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data
ModalityType = imagebind_model.ModalityType
max_tgt_len = 400
class PandaGPT:
def __init__(self, args):
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
self.visual_encoder.eval()
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
self.max_tgt_len = max_tgt_len
self.model = MyModel(["main", *args])
self.generated_text = ""
self.device = "cpu"
def load_projection(self, path):
state = torch.load(path, map_location="cpu")
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def eval_inputs(self, inputs):
self.model.eval_string("<Img>")
embds = self.extract_multimoal_feature(inputs)
for i in embds:
self.model.eval_float(i.T)
self.model.eval_string("</Img> ")
def chat(self, question):
return self.chat_with_image(None, question)
def chat_with_image(self, inputs, question):
if self.generated_text == "":
self.model.eval_string("###")
self.model.eval_string(" Human: ")
if inputs:
self.eval_inputs(inputs)
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
ret = self.model.generate_with_print(end="###")
self.generated_text += ret
return ret
def extract_multimoal_feature(self, inputs):
features = []
for key in ["image", "audio", "video", "thermal"]:
if key + "_paths" in inputs:
embeds = self.encode_data(key, inputs[key+"_paths"])
features.append(embeds)
return features
def encode_data(self, data_type, data_paths):
type_map = {
"image": ModalityType.VISION,
"audio": ModalityType.AUDIO,
"video": ModalityType.VISION,
"thermal": ModalityType.THERMAL,
}
load_map = {
"image": data.load_and_transform_vision_data,
"audio": data.load_and_transform_audio_data,
"video": data.load_and_transform_video_data,
"thermal": data.load_and_transform_thermal_data
}
load_function = load_map[data_type]
key = type_map[data_type]
inputs = {key: load_function(data_paths, self.device)}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
embeds = embeddings[key]
embeds = self.llama_proj(embeds).cpu().numpy()
return embeds
if __name__=="__main__":
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
a.load_projection("./models/panda_gpt/adapter_model.bin")
a.chat_with_image(
{"image_paths": ["./media/llama1-logo.png"]},
"what is the text in the picture? 'llama' or 'lambda'?")
a.chat("what is the color of it?")

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} embedding.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

View File

@@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
}
struct ggml_init_params params_ggml;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
@@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
struct ggml_init_params params;
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_buffer = NULL;
params.no_alloc = true;
struct ggml_context * ctx = NULL;

View File

@@ -642,9 +642,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0;
return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
t, KQ_pos, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale);
};
set_name(tokens_input, "tokens_input");
@@ -653,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
if (ggml_is_quantized(a->type)) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b);
@@ -772,7 +771,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
if (enable_checkpointing) {
ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
} else {
ggml_graph_cpy(gf, gb);
*gb = *gf;
ggml_build_backward_expand(ctx, gf, gb, true);
}
@@ -1460,17 +1459,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
}
params->n_rank_w3 = std::stoi(argv[i]);
params->custom_n_rank_w3 = true;
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params->common.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params);
@@ -1557,7 +1545,6 @@ int main(int argc, char ** argv) {
srand(params.common.seed);
struct llama_model_params llama_mparams = llama_model_default_params();
llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
llama_mparams.vocab_only = false;
printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
@@ -1615,7 +1602,6 @@ int main(int argc, char ** argv) {
opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
opt->params.print_forward_graph = false;
opt->params.print_backward_graph = false;
opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
opt->params.n_threads = params.common.n_threads;
opt->params.past = params.common.opt_past;
opt->params.delta = params.common.opt_delta;
@@ -1742,9 +1728,11 @@ int main(int argc, char ** argv) {
ggml_allocr_free(alloc);
// context for compute tensors without their data
const size_t estimated_compute_size_wo_data = (
2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
(params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
size_t estimated_compute_size_wo_data = (
ggml_tensor_overhead()*GGML_MAX_NODES*2
+ (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
params.common.use_checkpointing ? 3 : 2
)
);
struct ggml_init_params ctx_compute_params = {
estimated_compute_size_wo_data, // mem_size
@@ -1767,11 +1755,11 @@ int main(int argc, char ** argv) {
for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new_measure(tensor_alignment);
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf = ggml_new_graph(ctx_compute);
gf->order = (enum ggml_cgraph_eval_order) order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gb = ggml_new_graph(ctx_compute);
gb_tmp = params.common.use_checkpointing
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
? ggml_new_graph(ctx_compute)
: NULL;
loss = llama_build_lora_finetune_graphs(
&model, &lora, alloc, ctx_compute,
@@ -1800,11 +1788,11 @@ int main(int argc, char ** argv) {
mem_compute_data.resize(max_compute_size);
ctx_compute = ggml_init(ctx_compute_params);
alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gf = ggml_new_graph(ctx_compute);
gf->order = best_order;
gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
gb = ggml_new_graph(ctx_compute);
gb_tmp = params.common.use_checkpointing
? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
? ggml_new_graph(ctx_compute)
: NULL;
loss = llama_build_lora_finetune_graphs(
&model, &lora, alloc, ctx_compute,

View File

@@ -1,34 +0,0 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} infill.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -2,6 +2,7 @@
#include "console.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"
#include <cassert>
@@ -38,9 +39,9 @@ static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
@@ -103,7 +104,7 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sparams;
llama_sampling_params & sparams = params.sampling_params;
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
@@ -183,8 +184,8 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
@@ -245,23 +246,23 @@ int main(int argc, char ** argv) {
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd_inp.push_back(llama_token_middle(ctx));
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
embd_inp.push_back(llama_token_bos(ctx));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
}
// Tokenize negative prompt
@@ -272,10 +273,10 @@ int main(int argc, char ** argv) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@@ -293,8 +294,8 @@ int main(int argc, char ** argv) {
params.n_keep = (int)embd_inp.size();
}
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
// enable interactive mode if interactive start is specified
@@ -357,10 +358,39 @@ int main(int argc, char ** argv) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar;
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
LOG_TEE("%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar);
LOG_TEE("\n");
{
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
}
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
// TODO: replace with ring-buffer
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
LOG_TEE("\n##### Infill mode #####\n\n");
if (params.infill) {
printf("\n************\n");
@@ -403,7 +433,11 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
const int n_vocab = llama_n_vocab(model);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
while (n_remain != 0 || params.interactive) {
// predict
@@ -450,7 +484,7 @@ int main(int argc, char ** argv) {
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
}
@@ -478,7 +512,7 @@ int main(int argc, char ** argv) {
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
} else {
input_buf = embd.data();
input_size = embd.size();
@@ -501,7 +535,7 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
@@ -520,11 +554,12 @@ int main(int argc, char ** argv) {
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
llama_sampling_accept(ctx_sampling, ctx, id, true);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
embd.push_back(id);
@@ -540,11 +575,8 @@ int main(int argc, char ** argv) {
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(embd_inp[n_consumed]);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
@@ -576,10 +608,10 @@ int main(int argc, char ** argv) {
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
if(is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
}
fflush(stdout);
printf("\n");
@@ -593,7 +625,7 @@ int main(int argc, char ** argv) {
buffer += line;
} while (another_line);
// check if we got an empty line, if so we use the old input
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_prefix = buffer;
}
buffer.clear();
@@ -603,7 +635,7 @@ int main(int argc, char ** argv) {
buffer += line;
} while (another_line);
// check if we got an empty line
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_suffix = buffer;
}
buffer.clear();
@@ -616,7 +648,7 @@ int main(int argc, char ** argv) {
process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
@@ -626,14 +658,14 @@ int main(int argc, char ** argv) {
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd_inp.push_back(llama_token_middle(ctx));
embd.clear();
embd_guidance.clear();
n_remain = params.n_predict;
@@ -643,7 +675,7 @@ int main(int argc, char ** argv) {
is_interacting = false;
}
// deal with end of text token in interactive mode
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
else if (last_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n");
if (params.interactive) {
@@ -660,7 +692,7 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
embd_inp.push_back(llama_token_bos(ctx));
}
std::string buffer;
@@ -695,7 +727,7 @@ int main(int argc, char ** argv) {
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
@@ -716,14 +748,22 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
// reset grammar state if we're restarting generation
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
}
}
is_interacting = false;
}
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
break;
}
@@ -735,7 +775,7 @@ int main(int argc, char ** argv) {
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
fflush(stdout);
}
@@ -746,7 +786,9 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
llama_sampling_free(ctx_sampling);
if (grammar != NULL) {
llama_grammar_free(grammar);
}
llama_backend_free();
#ifndef LOG_DISABLE_LOGS

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} llama-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -19,6 +19,7 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "build-info.h"
#include "ggml-cuda.h"
// utils
@@ -640,8 +641,8 @@ struct test {
}
};
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const std::string test::build_commit = BUILD_COMMIT;
const int test::build_number = BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::metal = !!ggml_cpu_has_metal();
@@ -932,7 +933,7 @@ struct sql_printer : public printer {
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
int n_processed = 0;
llama_set_n_threads(ctx, n_threads, n_threads);
@@ -945,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos(llama_get_model(ctx));
llama_token token = llama_token_bos(ctx);
llama_set_n_threads(ctx, n_threads, n_threads);
@@ -1036,7 +1037,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);
llama_kv_cache_clear(ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
// warmup run
if (t.n_prompt > 0) {
@@ -1047,7 +1048,7 @@ int main(int argc, char ** argv) {
}
for (int i = 0; i < params.reps; i++) {
llama_kv_cache_clear(ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {

View File

@@ -1,36 +1,20 @@
add_library(llava OBJECT
llava.cpp
llava.h
clip.cpp
clip.h
)
target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(llava PUBLIC .)
target_include_directories(llava PUBLIC ../..)
target_include_directories(llava PUBLIC ../../common)
target_compile_features(llava PRIVATE cxx_std_11)
add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
if (BUILD_SHARED_LIBS)
set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS llava_shared LIBRARY)
endif()
set(TARGET clip)
add_library(${TARGET} clip.cpp clip.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(llava BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
set(TARGET llava-cli)
add_executable(llava-cli llava-cli.cpp)
install(TARGETS llava-cli RUNTIME)
target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(llava PRIVATE cxx_std_11)
set(TARGET llava)
add_executable(${TARGET} llava.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -9,12 +9,12 @@ models are available.
After API is confirmed, more models will be supported / uploaded.
## Usage
Build with cmake or run `make llava-cli` to build it.
Build with cmake or run `make llava` to build it.
After building, run: `./llava-cli` to see the usage. For example:
After building, run: `./llava` to see the usage. For example:
```sh
./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -51,6 +51,7 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
## TODO
- [ ] Support server mode.
- [ ] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.

View File

@@ -112,7 +112,8 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
if (!cur) {
throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
printf("unable to find tensor %s\n", name.c_str());
throw std::runtime_error(format("unable to find tensor %s\n", name.c_str()));
}
return cur;
@@ -135,7 +136,7 @@ static std::string get_ftype(int ftype) {
case 8:
return "q8_0";
default:
throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
throw std::runtime_error(format("Unrecognized file type: %d\n", ftype));
}
}
@@ -461,9 +462,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
};
struct gguf_context * ctx = gguf_init_from_file(fname, params);
if (!ctx) {
throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
}
if (verbosity >= 1) {
const int n_tensors = gguf_get_n_tensors(ctx);
@@ -610,8 +608,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
for (int i = 0; i < 3; ++i) {
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
}
if (verbosity >= 2) {
@@ -664,7 +662,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
// measure mem requirement and allocate
{
static const size_t tensor_alignment = 32;
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
clip_image_f32_batch batch;
batch.size = 1;
@@ -680,44 +678,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip;
}
clip_image_u8 * make_clip_image_u8() {
auto img = new clip_image_u8();
return img;
}
clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); }
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
img->nx = nx;
img->ny = ny;
img->size = nx * ny * 3;
img->data = new uint8_t[img->size]();
memcpy(img->data, data, img->size);
}
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
stbi_image_free(data);
return true;
}
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
int nx, ny, nc;
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
img->nx = nx;
img->ny = ny;
img->size = nx * ny * 3;
img->data = new uint8_t[img->size]();
memcpy(img->data, data, img->size);
stbi_image_free(data);
return true;
}
@@ -732,40 +712,39 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
clip_image_u8 temp; // we will keep the input image data here temporarily
if (pad2square && img->nx != img->ny) {
int longer_side = std::max(img->nx, img->ny);
temp->nx = longer_side;
temp->ny = longer_side;
temp->size = 3 * longer_side * longer_side;
temp->data = new uint8_t[temp->size]();
temp.nx = longer_side;
temp.ny = longer_side;
temp.size = 3 * longer_side * longer_side;
temp.data = new uint8_t[temp.size]();
uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
// fill with background color
for (size_t i = 0; i < temp->size; i++) {
temp->data[i] = bc[i % 3];
for (size_t i = 0; i < temp.size; i++) {
temp.data[i] = bc[i % 3];
}
// copy from the input image
for (int y = 0; y < img->ny; y++) {
for (int x = 0; x < img->nx; x++) {
const int i = 3 * (y * img->nx + x);
const int j = 3 * (y * temp->nx + x);
temp->data[j] = img->data[i];
temp->data[j+1] = img->data[i+1];
temp->data[j+2] = img->data[i+2];
const int j = 3 * (y * temp.nx + x);
temp.data[j] = img->data[i];
temp.data[j+1] = img->data[i+1];
temp.data[j+2] = img->data[i+2];
}
}
} else {
temp->nx = img->nx;
temp->ny = img->ny;
temp->size = img->size;
temp->data = new uint8_t[temp->size]();
*temp->data = *img->data; // copy
temp.nx = img->nx;
temp.ny = img->ny;
temp.size = img->size;
temp.data = img->data;
}
const int nx = temp->nx;
const int ny = temp->ny;
const int nx = temp.nx;
const int ny = temp.ny;
const int nx2 = ctx->vision_model.hparams.image_size;
const int ny2 = ctx->vision_model.hparams.image_size;
@@ -804,10 +783,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
const int j10 = 3 * (y1 * nx + x0) + c;
const int j11 = 3 * (y1 * nx + x1) + c;
const float v00 = temp->data[j00];
const float v01 = temp->data[j01];
const float v10 = temp->data[j10];
const float v11 = temp->data[j11];
const float v00 = temp.data[j00];
const float v01 = temp.data[j01];
const float v10 = temp.data[j10];
const float v11 = temp.data[j11];
const float v0 = v00 * (1.0f - dx) + v01 * dx;
const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -822,7 +801,6 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
}
}
}
clip_image_u8_free(temp);
return true;
}
@@ -1069,16 +1047,16 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
return true;
}
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
int clip_n_mmproj_embd(struct clip_ctx * ctx) {
return ctx->vision_model.mm_2_b->ne[0];
}
int clip_n_patches(const struct clip_ctx * ctx) {
int clip_n_patches(struct clip_ctx * ctx) {
auto & params = ctx->vision_model.hparams;
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
}
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
size_t clip_embd_nbytes(struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
}

View File

@@ -1,22 +1,7 @@
#ifndef CLIP_H
#define CLIP_H
#include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define CLIP_API __declspec(dllexport)
# else
# define CLIP_API __declspec(dllimport)
# endif
# else
# define CLIP_API __attribute__ ((visibility ("default")))
# endif
#else
# define CLIP_API
#endif
#include "ggml.h"
struct clip_ctx;
@@ -35,20 +20,19 @@ struct clip_vision_hparams {
float eps;
};
/** load mmproj model */
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
/** free mmproj model */
CLIP_API void clip_free(struct clip_ctx * ctx);
struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
int clip_n_patches(const struct clip_ctx * ctx);
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
void clip_free(struct clip_ctx * ctx);
size_t clip_embd_nbytes(struct clip_ctx * ctx);
int clip_n_patches(struct clip_ctx * ctx);
int clip_n_mmproj_embd(struct clip_ctx * ctx);
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
uint8_t * data = NULL;
uint8_t * data;
size_t size;
};
@@ -57,7 +41,7 @@ struct clip_image_u8 {
struct clip_image_f32 {
int nx;
int ny;
float * data = NULL;
float * data;
size_t size;
};
@@ -73,12 +57,7 @@ struct clip_image_f32_batch {
struct clip_image_u8 * make_clip_image_u8();
struct clip_image_f32 * make_clip_image_f32();
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

View File

@@ -1,313 +0,0 @@
#include "ggml.h"
#include "common.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
// TODO: use common/sampling.h
static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
auto & sparams = params.sparams;
// out of user input, sample next token
const float temp = sparams.temp;
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
const float top_p = sparams.top_p;
const float tfs_z = sparams.tfs_z;
const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
// const float repeat_penalty = sparams.repeat_penalty;
// const float alpha_presence = sparams.presence_penalty;
// const float alpha_frequency = sparams.frequency_penalty;
const int mirostat = sparams.mirostat;
const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = sparams.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
// Apply params.logit_bias map
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
}
return id;
}
static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
int id = sample_id(ctx_llama, params);
static std::string ret;
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
// load and preprocess the image
llava_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
printf("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
if (!embed) {
fprintf(stderr, "%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
return NULL;
}
}
return embed;
}
static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
// generate the response
printf("\n");
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
}
static struct llava_context * llava_init(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_backend_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return NULL;
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
auto ctx_llava = llava_init(&params);
if (ctx_llava == NULL) {
fprintf(stderr, "%s: error: failed to init llava\n", __func__);
return 1;
}
auto image_embed = load_image(ctx_llava, &params);
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
llava_free(ctx_llava);
return 0;
}

View File

@@ -16,29 +16,13 @@ checkpoint = torch.load(path)
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
projector = {name: checkpoint[name] for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")
# remove these tensors from the checkpoint and save it again
for name in mm_tensors:
del checkpoint[name]
# BakLLaVA models contain CLIP tensors in it
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
torch.save(clip, f"{args.model}/llava.clip")
# remove these tensors
for name in clip_tensors:
del checkpoint[name]
# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
with open(f"{args.model}/added_tokens.json", "w") as f:
f.write("{}\n")
torch.save(checkpoint, path)
print("Done!")

View File

@@ -0,0 +1,145 @@
#pragma once
// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
#include "common.h"
#include "llama.h"
#include <cstdio>
#include <cstdlib>
#include <vector>
inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < N; i += n_batch) {
int n_eval = N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
// TODO: use common/sampling.h
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
// out of user input, sample next token
const float temp = params.sampling_params.temp;
const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
const float top_p = params.sampling_params.top_p;
const float tfs_z = params.sampling_params.tfs_z;
const float typical_p = params.sampling_params.typical_p;
// const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
// const float repeat_penalty = params.sampling_params.repeat_penalty;
// const float alpha_presence = params.sampling_params.presence_penalty;
// const float alpha_frequency = params.sampling_params.frequency_penalty;
const int mirostat = params.sampling_params.mirostat;
const float mirostat_tau = params.sampling_params.mirostat_tau;
const float mirostat_eta = params.sampling_params.mirostat_eta;
// const bool penalize_nl = params.sampling_params.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
// Apply params.logit_bias map
for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
}
return id;
}
inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
int id = sample_id(ctx_llama, params);
static std::string ret;
if (id == llama_token_eos(ctx_llama)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}

View File

@@ -1,156 +1,162 @@
#include "clip.h"
#include "llava-utils.h"
#include "common.h"
#include "llama.h"
#include "llava.h"
#include <cstdio>
#include <cstdlib>
#include <vector>
#include "base64.hpp"
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
clip_image_f32 * img_res = make_clip_image_f32();
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
clip_image_f32_free(img_res);
return false;
}
*n_img_pos = clip_n_patches(ctx_clip);
const int64_t t_img_enc_start_us = ggml_time_us();
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
clip_image_f32_free(img_res);
if (!encoded) {
fprintf(stderr, "Unable to encode image\n");
return false;
}
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
static void show_additional_info(int /*argc*/, char ** argv) {
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
if (params.mmproj.empty() || params.image.empty()) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
const char * clip_path = params.mmproj.c_str();
const char * img_path = params.image.c_str();
if (params.prompt.empty()) {
params.prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
// load and preprocess the image
clip_image_u8 img;
clip_image_f32 img_res;
if (!clip_image_load_from_file(img_path, &img)) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
clip_free(ctx_clip);
return 1;
}
if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
clip_free(ctx_clip);
return 1;
}
int n_img_pos = clip_n_patches(ctx_clip);
int n_img_embd = clip_n_mmproj_embd(ctx_clip);
static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
return 1;
}
const int64_t t_img_enc_start_us = ggml_time_us();
if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
fprintf(stderr, "Unable to encode image\n");
return 1;
}
const int64_t t_img_enc_end_us = ggml_time_us();
// we get the embeddings, free up the memory required for CLIP
clip_free(ctx_clip);
llama_backend_init(params.numa);
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.main_gpu = params.main_gpu;
model_params.tensor_split = params.tensor_split;
model_params.use_mmap = params.use_mmap;
model_params.use_mlock = params.use_mlock;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
if (n_img_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
llama_free(ctx_llama);
llama_free_model(model);
llama_backend_free();
free(image_embd);
return false;
return 1;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
*image_embd_out = image_embd;
*n_img_pos_out = n_img_pos;
// process the prompt
// llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
return true;
}
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = make_clip_image_u8();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
float* image_embed = NULL;
int n_image_pos = 0;
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
return NULL;
}
clip_image_u8_free(img);
auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
result->embed = image_embed;
result->n_image_pos = n_image_pos;
return result;
}
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
fprintf(stderr, "%s: can't read file %s\n", __func__, path);
return false;
}
fseek(file, 0, SEEK_END);
auto fileSize = ftell(file);
fseek(file, 0, SEEK_SET);
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
}
fread(buffer, 1, fileSize, file); // Read the file into the buffer
fclose(file); // Close the file
*bytesOut = buffer;
*sizeOut = fileSize;
return true;
}
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
return NULL;
}
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
free(image_bytes);
return embed;
}
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
free(embed->embed);
free(embed);
int n_past = 0;
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// GG: are we sure that the should be a trailing whitespace at the end of this string?
eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
eval_string(ctx_llama, "\nASSISTANT:", params.n_batch, &n_past);
// generate the response
printf("\n");
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_llama, params, &n_past);
if (strcmp(tmp, "</s>") == 0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
{
const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
}
llama_print_timings(ctx_llama);
llama_free(ctx_llama);
llama_free_model(model);
llama_backend_free();
free(image_embd);
return 0;
}

View File

@@ -1,50 +0,0 @@
#ifndef LLAVA_H
#define LLAVA_H
#include "ggml.h"
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAVA_API __declspec(dllexport)
# else
# define LLAVA_API __declspec(dllimport)
# endif
# else
# define LLAVA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAVA_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct llava_image_embed {
float * embed;
int n_image_pos;
};
/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** free an embedding made with llava_image_embed_make_* */
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -16,8 +16,6 @@ add_library(common OBJECT
${_common_path}/console.cpp
${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp
${_common_path}/sampling.h
${_common_path}/sampling.cpp
)
# WARNING: because build-info.h is auto-generated, it will only

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -142,7 +142,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
### Extended Context Size
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
@@ -208,14 +208,6 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
Example usage: `--top-p 0.95`
### Min P Sampling
- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
Example usage: `--min-p 0.05`
### Tail Free Sampling (TFS)
- `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

View File

@@ -2,6 +2,8 @@
#include "console.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"
#include <cassert>
#include <cinttypes>
@@ -107,7 +109,7 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
llama_sampling_params & sparams = params.sparams;
llama_sampling_params & sparams = params.sampling_params;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
@@ -152,8 +154,8 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
@@ -236,19 +238,19 @@ int main(int argc, char ** argv) {
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
}
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
embd_inp.push_back(llama_token_bos(ctx));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
}
// Tokenize negative prompt
@@ -258,11 +260,11 @@ int main(int argc, char ** argv) {
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@@ -297,7 +299,7 @@ int main(int argc, char ** argv) {
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
}
LOGLN(
@@ -318,11 +320,11 @@ int main(int argc, char ** argv) {
}
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
@@ -381,12 +383,6 @@ int main(int argc, char ** argv) {
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
@@ -396,28 +392,46 @@ int main(int argc, char ** argv) {
if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar;
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
LOG_TEE("%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar);
LOG_TEE("\n");
{
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
}
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
// TODO: replace with ring-buffer
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
@@ -457,7 +471,11 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
const int n_vocab = llama_n_vocab(model);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
@@ -504,7 +522,7 @@ int main(int argc, char ** argv) {
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
LOG("clear session path\n");
path_session.clear();
@@ -534,6 +552,7 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
@@ -555,7 +574,7 @@ int main(int argc, char ** argv) {
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
} else {
input_buf = embd.data();
input_size = embd.size();
@@ -578,7 +597,7 @@ int main(int argc, char ** argv) {
n_eval = params.n_batch;
}
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
@@ -608,11 +627,12 @@ int main(int argc, char ** argv) {
LOG("saved session to %s\n", path_session.c_str());
}
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
llama_sampling_accept(ctx_sampling, ctx, id, true);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
embd.push_back(id);
@@ -628,11 +648,8 @@ int main(int argc, char ** argv) {
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(embd_inp[n_consumed]);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
@@ -662,10 +679,12 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt in the last n_prev tokens
// check for reverse prompt
if (!params.antiprompt.empty()) {
const int n_prev = 32;
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
std::string last_output;
for (auto id : last_tokens) {
last_output += llama_token_to_piece(ctx, id);
}
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
@@ -692,13 +711,13 @@ int main(int argc, char ** argv) {
}
// deal with end of text token in interactive mode
if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
if (last_tokens.back() == llama_token_eos(ctx)) {
LOG("found EOS token\n");
if (params.interactive) {
if (!params.antiprompt.empty()) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
is_antiprompt = true;
}
@@ -719,13 +738,14 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
embd_inp.push_back(llama_token_bos(ctx));
}
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}
// color user input only
@@ -747,6 +767,7 @@ int main(int argc, char ** argv) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
}
@@ -760,18 +781,11 @@ int main(int argc, char ** argv) {
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
if (params.escape) {
process_escapes(buffer);
}
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
// instruct mode: insert response suffix
if (params.instruct) {
@@ -796,14 +810,22 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
// reset grammar state if we're restarting generation
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
}
}
is_interacting = false;
}
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
LOG_TEE(" [end of text]\n");
break;
}
@@ -828,7 +850,9 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
llama_sampling_free(ctx_sampling);
if (grammar != NULL) {
llama_grammar_free(grammar);
}
llama_backend_free();
#ifndef LOG_DISABLE_LOGS

View File

@@ -34,7 +34,7 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx_data = NULL;
struct ggml_context * ctx_eval = NULL;
struct ggml_cgraph * gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
// this allocates all Metal resources and memory buffers
auto * ctx_metal = ggml_metal_init(1);
@@ -46,13 +46,13 @@ int main(int argc, char ** argv) {
// main
{
struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
*(int32_t *) input->data = 1; // BOS
ggml_metal_set_tensor(ctx_metal, input);
// warmup
ggml_metal_graph_compute(ctx_metal, gf);
ggml_metal_graph_compute(ctx_metal, &gf);
const int n_iter = 16;
@@ -60,7 +60,7 @@ int main(int argc, char ** argv) {
// the actual inference happens here
for (int i = 0; i < n_iter; ++i) {
ggml_metal_graph_compute(ctx_metal, gf);
ggml_metal_graph_compute(ctx_metal, &gf);
}
const int64_t t1 = ggml_time_us();
@@ -70,7 +70,7 @@ int main(int argc, char ** argv) {
// debug output
{
struct ggml_tensor * logits = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
ggml_metal_get_tensor(ctx_metal, logits);
float * ptr = (float *) ggml_get_data(logits);

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} parallel.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,3 @@
# llama.cpp/example/parallel
Simplified simulation of serving incoming requests in parallel
Simplified simluation for serving incoming requests in parallel

View File

@@ -1,6 +1,8 @@
// A basic application simulating a server with multiple clients.
// The clients submite requests to the server and they are processed in parallel.
#include "build-info.h"
#include "common.h"
#include "llama.h"
@@ -49,12 +51,6 @@ static std::vector<std::string> k_prompts = {
};
struct client {
~client() {
if (ctx_sampling) {
llama_sampling_free(ctx_sampling);
}
}
int32_t id = 0;
llama_seq_id seq_id = -1;
@@ -72,7 +68,7 @@ struct client {
std::string prompt;
std::string response;
struct llama_sampling_context * ctx_sampling = nullptr;
std::vector<llama_token> tokens_prev;
};
static void print_date_time() {
@@ -129,6 +125,8 @@ int main(int argc, char ** argv) {
params.logits_all = true;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@@ -149,15 +147,20 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n\n");
fflush(stderr);
const int n_ctx = llama_n_ctx(ctx);
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(model);
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.ctx_sampling = llama_sampling_init(params.sparams);
client.tokens_prev.resize(std::max(256, params.n_predict));
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true);
const int32_t n_tokens_system = tokens_system.size();
@@ -166,7 +169,7 @@ int main(int argc, char ** argv) {
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
llama_batch batch = llama_batch_init(n_ctx, 0);
int32_t n_total_prompt = 0;
int32_t n_total_gen = 0;
@@ -181,8 +184,13 @@ int main(int argc, char ** argv) {
{
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
batch.n_tokens = n_tokens_system;
for (int32_t i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = tokens_system[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
if (llama_decode(ctx, batch) != 0) {
@@ -201,7 +209,7 @@ int main(int argc, char ** argv) {
LOG_TEE("Processing requests ...\n\n");
while (true) {
llama_batch_clear(batch);
batch.n_tokens = 0;
// decode any currently ongoing sequences
for (auto & client : clients) {
@@ -209,11 +217,15 @@ int main(int argc, char ** argv) {
continue;
}
client.i_batch = batch.n_tokens;
llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
batch.token [batch.n_tokens] = client.sampled;
batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
batch.seq_id[batch.n_tokens] = client.id;
batch.logits[batch.n_tokens] = true;
client.n_decoded += 1;
client.i_batch = batch.n_tokens;
batch.n_tokens += 1;
}
if (batch.n_tokens == 0) {
@@ -238,14 +250,18 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:";
client.response = "";
llama_sampling_reset(client.ctx_sampling);
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
batch.token [batch.n_tokens] = tokens_prompt[i];
batch.pos [batch.n_tokens] = i + n_tokens_system;
batch.seq_id[batch.n_tokens] = client.id;
batch.logits[batch.n_tokens] = false;
batch.n_tokens += 1;
}
// extract the logits only for the last token
@@ -288,12 +304,11 @@ int main(int argc, char ** argv) {
llama_batch batch_view = {
n_tokens,
batch.token + i,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
batch.pos + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
@@ -326,9 +341,7 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
@@ -336,8 +349,11 @@ int main(int argc, char ** argv) {
client.t_start_gen = ggml_time_us();
}
const std::string token_str = llama_token_to_piece(ctx, id);
// remember which tokens were sampled - used for repetition penalties during sampling
client.tokens_prev.erase(client.tokens_prev.begin());
client.tokens_prev.push_back(id);
const std::string token_str = llama_token_to_piece(ctx, id);
client.response += token_str;
client.sampled = id;
@@ -345,7 +361,7 @@ int main(int argc, char ** argv) {
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
if (client.n_decoded > 2 &&
(id == llama_token_eos(model) ||
(id == llama_token_eos(ctx) ||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
client.response.find("User:") != std::string::npos ||
client.response.find('\n') != std::string::npos)) {
@@ -370,7 +386,7 @@ int main(int argc, char ** argv) {
n_total_prompt += client.n_prompt;
n_total_gen += client.n_decoded;
llama_sampling_context_reset(ctx_sampling, client.seq_id);
client.seq_id = -1;
}

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} perplexity.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
@@ -209,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
@@ -226,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
tokens[batch_start] = llama_token_bos(ctx);
}
const auto batch_logits = llama_get_logits(ctx);
@@ -338,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
@@ -349,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
tokens[batch_start] = llama_token_bos(ctx);
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
@@ -572,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
if (logits.empty()) {

View File

@@ -1,6 +1,6 @@
set(TARGET quantize-stats)
add_executable(${TARGET} quantize-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,4 +1,5 @@
#define LLAMA_API_INTERNAL
#include "build-info.h"
#include "common.h"
#include "ggml.h"
#include "llama.h"

View File

@@ -1,6 +1,9 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
@@ -17,6 +18,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
@@ -29,6 +31,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@@ -67,14 +70,13 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
}
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {
@@ -101,8 +103,6 @@ int main(int argc, char ** argv) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true;
} else {
usage(argv[0]);
}

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,3 +1,4 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
@@ -7,7 +8,10 @@
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sampling_params;
params.seed = 42;
params.n_threads = 4;
sparams.repeat_last_n = 64;
params.prompt = "The quick brown fox";
if (!gpt_params_parse(argc, argv, params)) {
@@ -21,49 +25,56 @@ int main(int argc, char ** argv) {
}
auto n_past = 0;
std::string result0;
std::string result1;
auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if (model == nullptr) {
return 1;
}
if (ctx == nullptr) {
llama_free_model(model);
return 1;
}
auto tokens = llama_tokenize(ctx, params.prompt, true);
auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx);
llama_free_model(model);
return 1;
}
// tokenize prompt
auto tokens = llama_tokenize(ctx, params.prompt, true);
// evaluate prompt
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
n_past += tokens.size();
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0));
// save state (rng, logits, embedding and kv_cache) to file
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
n_past += n_prompt_tokens;
const size_t state_size = llama_get_state_size(ctx);
uint8_t * state_mem = new uint8_t[state_size];
// Save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
fclose(fp_write);
}
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
fwrite(state_mem, 1, state_size, fp_write);
fclose(fp_write);
}
// save state (last tokens)
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
const auto n_past_saved = n_past;
// first run
printf("\nfirst run: %s", params.prompt.c_str());
printf("\n%s", params.prompt.c_str());
for (auto i = 0; i < params.n_predict; i++) {
auto * logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(model);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -72,10 +83,9 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str.c_str());
result0 += next_token_str;
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx);
@@ -93,28 +103,32 @@ int main(int argc, char ** argv) {
// make new context
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
printf("\nsecond run: %s", params.prompt.c_str());
// load state (rng, logits, embedding and kv_cache) from file
// Load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
FILE *fp_read = fopen("dump_state.bin", "rb");
if (state_size != llama_get_state_size(ctx2)) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
FILE * fp_read = fopen("dump_state.bin", "rb");
const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
if (ret != state_mem.size()) {
const size_t ret = fread(state_mem, 1, state_size, fp_read);
if (ret != state_size) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}
llama_set_state_data(ctx2, state_mem.data());
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
fclose(fp_read);
}
delete[] state_mem;
// restore state (last tokens)
last_n_tokens_data = last_n_tokens_data_saved;
n_past = n_past_saved;
// second run
@@ -129,11 +143,10 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx2, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str.c_str());
result1 += next_token_str;
if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2);
llama_free_model(model);
@@ -142,17 +155,10 @@ int main(int argc, char ** argv) {
n_past += 1;
}
printf("\n");
printf("\n\n");
llama_free(ctx2);
llama_free_model(model);
if (result0 != result1) {
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
return 1;
}
fprintf(stderr, "\n%s : success\n", __func__);
return 0;
}

View File

@@ -6,8 +6,11 @@ install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -7,7 +7,7 @@ Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
@@ -24,10 +24,6 @@ Command line options:
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
## Build
@@ -110,27 +106,25 @@ node index.js
## API Endpoints
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
- **POST** `/completion`: Given a prompt, it returns the predicted completion.
*Options:*
`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
`temperature`: Adjust the randomness of the generated text (default: 0.8).
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
`min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -164,44 +158,6 @@ node index.js
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
*Result JSON:*
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
`model`: The path to the model loaded with `-m`
`prompt`: The provided `prompt`
`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered
`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided
`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)
`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
`tokens_evaluated`: Number of tokens evaluated in total from the prompt
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
@@ -232,32 +188,8 @@ node index.js
It also accepts all the options of `/completion` except `stream` and `prompt`.
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
## More examples
### Change system prompt on runtime
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
`prompt`: Specify a context that you want all connecting clients to respect.
`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
```json
{
"system_prompt": {
"prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
"anti_prompt": "User:",
"assistant_name": "Assistant:"
}
}
```
**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
### Interactive mode
Check the sample in [chat.mjs](chat.mjs).

View File

@@ -8,7 +8,6 @@ import json
app = Flask(__name__)
slot_id = -1
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@@ -78,8 +77,7 @@ def make_postData(body, chat=False, stream=False):
if(is_present(body, "stop")): postData["stop"] += body["stop"]
postData["n_keep"] = -1
postData["stream"] = stream
postData["cache_prompt"] = True
postData["slot_id"] = slot_id
return postData
def make_resData(data, chat=False, promptToken=[]):
@@ -130,7 +128,6 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
}
]
}
slot_id = data["slot_id"]
if (chat):
if (start):
resData["choices"][0]["delta"] = {

View File

@@ -7,11 +7,6 @@ const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema"
);
const no_cached_prompt = args.find(
(_, index) => args[index - 1] === "--no-cache-prompt"
) ?? "false";
const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
// Example usage: function,arguments
@@ -35,9 +30,6 @@ if (grammarFile) {
grammar = readFileSync(grammarFile, 'utf-8')
}
// for cached prompt
let slot_id = -1;
const API_URL = 'http://127.0.0.1:8080'
const chat = [
@@ -84,8 +76,6 @@ async function chat_completion(question) {
top_p: 0.9,
n_keep: n_keep,
n_predict: 256,
cache_prompt: no_cached_prompt === "false",
slot_id: slot_id,
stop: ["\n### Human:"], // stop completion after generating this
grammar,
stream: true,
@@ -102,7 +92,6 @@ async function chat_completion(question) {
const t = Buffer.from(chunk).toString('utf8')
if (t.startsWith('data: ')) {
const message = JSON.parse(t.substring(6))
slot_id = message.slot_id
answer += message.content
process.stdout.write(message.content)
if (message.stop) {

File diff suppressed because it is too large Load Diff

View File

@@ -125,7 +125,6 @@
background-color: #222;
color: #ddd;
}
code {
font-family: monospace;
padding: 0.1em 0.3em;
@@ -142,8 +141,7 @@
display: inline;
}
header,
footer {
header, footer {
text-align: center;
}
@@ -160,17 +158,11 @@
height: 10em;
}
[contenteditable] {
display: inline-block;
white-space: pre-wrap;
outline: 0px solid transparent;
}
@keyframes loading-bg-wipe {
0% {
background-position: 0%;
}
100% {
background-position: 100%;
}
@@ -189,7 +181,6 @@
--loading-color-1: #22222200;
--loading-color-2: #222222ff;
}
.popover-content {
background-color: black;
}
@@ -203,8 +194,6 @@
import { llama } from '/completion.js';
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
let selected_image = false;
var slot_id = -1;
const session = signal({
prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
@@ -214,7 +203,6 @@
type: "chat", // "chat" | "completion"
char: "Llama",
user: "User",
image_selected: ''
})
const params = signal({
@@ -224,7 +212,6 @@
repeat_penalty: 1.18, // 1.0 = disabled
top_k: 40, // <= 0 to use vocab size
top_p: 0.5, // 1.0 = disabled
min_p: 0.05, // 0 = disabled
tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled
presence_penalty: 0.0, // 0.0 = disabled
@@ -233,9 +220,7 @@
mirostat_tau: 5, // target entropy
mirostat_eta: 0.1, // learning rate
grammar: '',
n_probs: 0, // no completion_probabilities,
image_data: [],
cache_prompt: true
n_probs: 0, // no completion_probabilities
})
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@@ -285,7 +270,6 @@
// saved templates were successfuly imported.
console.log('Processing saved templates and updating default template')
params.value = { ...params.value, image_data: [] };
//console.log(importedTemplates);
savedUserTemplates.value = importedTemplates;
@@ -310,9 +294,7 @@
function userTemplateApply(t) {
session.value = t.data.session;
session.value = { ...session.value, image_selected: '' };
params.value = t.data.params;
params.value = { ...params.value, image_data: [] };
}
function userTemplateResetToDefaultAndApply() {
@@ -403,25 +385,20 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
const data = chunk.data;
if (data.stop) {
while (
currentMessages.length > 0 &&
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
) {
) {
currentMessages.pop();
}
transcriptUpdate([...history, [char, currentMessages]])
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
} else {
currentMessages.push(data);
slot_id = data.slot_id;
if (selected_image && !data.multimodal) {
alert("The server was not compiled for multimodal or the model projector can't be loaded.");
return;
}
transcriptUpdate([...history, [char, currentMessages]])
}
@@ -442,7 +419,7 @@
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
let prompt = template(session.value.template, {
const prompt = template(session.value.template, {
message: msg,
history: session.value.transcript.flatMap(
([name, data]) =>
@@ -457,33 +434,24 @@
)
).join("\n"),
});
if (selected_image) {
prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
}
await runLlama(prompt, {
...params.value,
slot_id: slot_id,
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
}, "{{char}}");
}
const runCompletion = () => {
const runCompletion = async () => {
if (controller.value) {
console.log('already running...');
return;
}
const { prompt } = session.value;
const {prompt} = session.value;
transcriptUpdate([...session.value.transcript, ["", prompt]]);
runLlama(prompt, {
await runLlama(prompt, {
...params.value,
slot_id: slot_id,
stop: [],
}, "").finally(() => {
session.value.prompt = session.value.transcript.map(([_, data]) =>
Array.isArray(data) ? data.map(msg => msg.content).join('') : data
).join('');
session.value.transcript = [];
})
}, "");
}
const stop = (e) => {
@@ -499,27 +467,6 @@
transcriptUpdate([]);
}
const uploadImage = (e) => {
e.preventDefault();
document.getElementById("fileInput").click();
document.getElementById("fileInput").addEventListener("change", function (event) {
const selectedFile = event.target.files[0];
if (selectedFile) {
const reader = new FileReader();
reader.onload = function () {
const image_data = reader.result;
session.value = { ...session.value, image_selected: image_data };
params.value = {
...params.value, image_data: [
{ data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
}
};
selected_image = true;
reader.readAsDataURL(selectedFile);
}
});
}
function MessageInput() {
const message = useSignal("")
@@ -550,7 +497,6 @@
</div>
<div class="right">
<button type="submit" disabled=${generating.value}>Send</button>
<button onclick=${uploadImage}>Upload Image</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>
@@ -583,7 +529,6 @@
}
}, [messages])
const isCompletionMode = session.value.type === 'completion'
const chatLine = ([user, data], index) => {
let message
const isArrayMessage = Array.isArray(data)
@@ -593,31 +538,19 @@
const text = isArrayMessage ?
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
data;
message = isCompletionMode ?
text :
html`<${Markdownish} text=${template(text)} />`
message = html`<${Markdownish} text=${template(text)} />`
}
if (user) {
if(user) {
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
} else {
return isCompletionMode ?
html`<span key=${index}>${message}</span>` :
html`<p key=${index}>${message}</p>`
return html`<p key=${index}>${message}</p>`
}
};
const handleCompletionEdit = (e) => {
session.value.prompt = e.target.innerText;
session.value.transcript = [];
}
return html`
<div id="chat" ref=${container} key=${messages.length}>
<img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
<span contenteditable=${isCompletionMode} ref=${container} oninput=${handleCompletionEdit}>
${messages.flatMap(chatLine)}
</span>
</div>`;
<section id="chat" ref=${container}>
${messages.flatMap(chatLine)}
</section>`;
};
const ConfigForm = (props) => {
@@ -634,7 +567,7 @@
const converter = new SchemaConverter(
grammarJsonSchemaPropOrder.value
.split(',')
.reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
.reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
)
converter.visit(schema, '')
params.value = {
@@ -646,7 +579,7 @@
}
}
const FloatField = ({ label, max, min, name, step, value }) => {
const FloatField = ({label, max, min, name, step, value}) => {
return html`
<div>
<label for="${name}">${label}</label>
@@ -656,7 +589,7 @@
`
};
const IntField = ({ label, max, min, name, value }) => {
const IntField = ({label, max, min, name, value}) => {
return html`
<div>
<label for="${name}">${label}</label>
@@ -739,7 +672,7 @@
${GrammarControl()}
</fieldset>
`
);
);
const CompletionConfigForm = () => (
html`
@@ -761,21 +694,20 @@
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
<fieldset class="two">
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
</fieldset>
<details>
<summary>More options</summary>
<fieldset class="two">
${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
</fieldset>
<hr />
<fieldset class="three">
@@ -784,11 +716,11 @@
<label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
<label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
</div>
${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
</fieldset>
<fieldset>
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
</fieldset>
</details>
</form>
@@ -827,20 +759,20 @@
const popoverChildren = html`
<div class="prob-set">
${probs.map((p, index) => {
return html`
return html`
<div
key=${index}
title=${`prob: ${p.prob}`}
style=${{
padding: '0.3em',
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
}}
padding: '0.3em',
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
}}
>
<span>${p.tok_str}: </span>
<span>${Math.floor(p.prob * 100)}%</span>
</div>
`
})}
})}
</div>
`
@@ -919,9 +851,9 @@
ref=${popoverRef}
class="popover-content"
style=${{
top: position.value.top,
left: position.value.left,
}}
top: position.value.top,
left: position.value.left,
}}
>
${props.popoverChildren}
</div>
@@ -1020,11 +952,8 @@
</head>
<body>
<div id="container">
<input type="file" id="fileInput" accept="image/*" style="display: none;">
</div>
<div id="container"></div>
<div id="portal"></div>
</body>
</html>

File diff suppressed because it is too large Load Diff

View File

@@ -92,11 +92,16 @@ int main(int argc, char ** argv) {
// create a llama_batch with size 512
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(512, 0, 1);
llama_batch batch = llama_batch_init(512, 0);
// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
batch.n_tokens = tokens_list.size();
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}
// llama_decode will output logits only for the last token of the prompt
@@ -133,7 +138,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream?
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
LOG_TEE("\n");
break;
@@ -143,10 +148,15 @@ int main(int argc, char ** argv) {
fflush(stdout);
// prepare the next batch
llama_batch_clear(batch);
batch.n_tokens = 0;
// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
batch.token [batch.n_tokens] = new_token_id;
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = 0;
batch.logits[batch.n_tokens] = true;
batch.n_tokens += 1;
n_decode += 1;
}

View File

@@ -3,3 +3,6 @@ add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,27 +1,14 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
#include "grammar-parser.h"
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
struct seq_draft {
bool active = false;
bool drafting = false;
bool skip = false;
int i_batch_dft = 0;
std::vector<int> i_batch_tgt;
std::vector<llama_token> tokens;
struct llama_sampling_context * ctx_sampling;
};
int main(int argc, char ** argv) {
gpt_params params;
@@ -34,15 +21,6 @@ int main(int argc, char ** argv) {
return 1;
}
// max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel;
// probability threshold for accepting a token from the draft model
const float p_accept = params.p_accept;
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));
LOG_TEE("Log start\n");
@@ -67,33 +45,6 @@ int main(int argc, char ** argv) {
params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
{
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
}
// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@@ -126,6 +77,8 @@ int main(int argc, char ** argv) {
const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab
const int n_ctx = llama_n_ctx(ctx_tgt);
const int n_vocab = llama_n_vocab(model_tgt);
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
// how many tokens to draft each time
@@ -138,130 +91,117 @@ int main(int argc, char ** argv) {
int n_past_tgt = inp.size();
int n_past_dft = inp.size();
std::vector<llama_token> drafted;
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);
for (auto & id : inp) {
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
// used to determine end of generation
bool has_eos = false;
// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
// grammar stuff
struct llama_grammar * grammar_dft = NULL;
struct llama_grammar * grammar_tgt = NULL;
// draft sequence data
std::vector<seq_draft> drafts(n_seq_dft);
grammar_parser::parse_state parsed_grammar;
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
// if requested - load the grammar, error checking is omitted for brevity
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
const auto t_dec_start = ggml_time_us();
// sample from the last token of the prompt
drafts[0].i_batch_tgt.resize(1);
drafts[0].i_batch_tgt[0] = 0;
while (true) {
// print current draft sequences
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
const auto & tokens = drafts[s].tokens;
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
}
int i_dft = 0;
int s_keep = 0;
int i_dft = 0;
while (true) {
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
// sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
// remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
printf("%s", token_str.c_str());
fflush(stdout);
if (id == llama_token_eos(model_tgt)) {
if (id == llama_token_eos(ctx_tgt)) {
has_eos = true;
}
++n_predict;
// check if the target token matches any of the drafts
{
bool matches = false;
// check if the draft matches the target
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past_tgt;
++n_past_dft;
++i_dft;
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
s_keep = s;
matches = true;
} else {
drafts[s].active = false;
}
}
if (matches) {
++n_accept;
++n_past_tgt;
++n_past_dft;
++i_dft;
continue;
}
continue;
}
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
// the drafted token was rejected or we are out of drafted tokens
// TODO: simplify
{
LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_kv_cache_seq_keep(ctx_dft, s_keep);
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_kv_cache_seq_keep(ctx_dft, 0);
llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
llama_kv_cache_seq_keep(ctx_tgt, s_keep);
llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
llama_kv_cache_seq_keep(ctx_tgt, 0);
if (i_dft < (int) drafted.size()) {
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
} else {
LOG("out of drafted tokens\n");
}
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].active = false;
drafts[s].tokens.clear();
drafts[s].i_batch_tgt.clear();
}
// note: will be erased after the speculation phase
drafts[0].tokens.push_back(id);
drafts[0].i_batch_tgt.push_back(0);
llama_batch_clear(batch_dft);
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft);
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
++n_past_dft;
// heuristic for n_draft
{
const int n_draft_cur = (int) drafted.size();
const bool all_accepted = i_dft == n_draft_cur;
LOG("n_draft = %d\n", n_draft);
LOG("n_draft_cur = %d\n", n_draft_cur);
LOG("i_dft = %d\n", i_dft);
LOG("all_accepted = %d\n", all_accepted);
if (all_accepted && n_draft == n_draft_cur) {
LOG(" - max drafted tokens accepted - n_draft += 8\n");
n_draft = std::min(30, n_draft + 8);
} else if (all_accepted) {
LOG(" - partially drafted tokens accepted - no change\n");
} else {
LOG(" - drafted token rejected - n_draft -= 1\n");
n_draft = std::max(2, n_draft - 1);
}
}
drafted.clear();
drafted.push_back(id);
break;
}
@@ -269,151 +209,78 @@ int main(int argc, char ** argv) {
break;
}
llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
if (grammar_tgt) {
if (grammar_dft) {
llama_grammar_free(grammar_dft);
}
// Note: Hardcoded to sequence id 0, if this ever supports parallel generation
// that will need to change.
auto it = ctx_sampling.sequence_contexts.find(0);
GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
// This is necessary because each sequence id in sequence_contexts
// uses a copy of the original grammar.
grammar_dft = llama_grammar_copy(it->second.grammar);
int n_seq_cur = 1;
int n_past_cur = n_past_dft;
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].active = false;
drafts[s].drafting = false;
LOG("copied target grammar to draft grammar\n");
}
drafts[0].active = true;
drafts[0].drafting = true;
drafts[0].i_batch_dft = 0;
llama_batch_clear(batch_tgt);
llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
// sample n_draft tokens from the draft model using tree-based sampling
// sample n_draft tokens from the draft model using greedy decoding
int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) {
batch_dft.n_tokens = 0;
float * logits = llama_get_logits(ctx_dft);
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].skip = false;
candidates.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].drafting || drafts[s].skip) {
continue;
}
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
const auto & cur_p = drafts[s].ctx_sampling->cur;
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
}
if (cur_p[0].p < p_accept) {
LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
drafts[s].drafting = false;
continue;
}
std::vector<int> sa(1, s);
// attempt to split the branch if the probability is high enough
for (int f = 1; f < 8; ++f) {
if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
// all previous tokens from this branch are now also part of the new branch
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
if (batch_tgt.seq_id[t][p] == s) {
batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
batch_tgt.n_seq_id[t]++;
break;
}
}
}
// copy the draft state
drafts[n_seq_cur].active = true;
drafts[n_seq_cur].drafting = true;
drafts[n_seq_cur].skip = true;
drafts[n_seq_cur].tokens = drafts[s].tokens;
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
sa.push_back(n_seq_cur);
n_seq_cur++;
} else {
break;
}
}
// add drafted token for each sequence
for (int is = 0; is < (int) sa.size(); ++is) {
const llama_token id = cur_p[is].id;
const int s = sa[is];
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
drafts[s].tokens.push_back(id);
// add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
// add the token to the batch for batched decoding with the draft model
drafts[s].i_batch_dft = batch_dft.n_tokens;
llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
if (batch_tgt.n_tokens > n_draft) {
drafts[s].drafting = false;
}
}
if (grammar_dft != NULL) {
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
}
// no sequence is drafting anymore
if (batch_dft.n_tokens == 0) {
// computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
}
// TODO: better logic?
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
break;
}
// evaluate the drafted tokens on the draft model
llama_decode(ctx_dft, batch_dft);
++n_past_cur;
// drafted token
const llama_token id = cur_p.data[0].id;
drafted.push_back(id);
++n_drafted;
if (batch_tgt.n_tokens > n_draft) {
// no need to evaluate the last drafted token, since we won't use the result
if (i == n_draft - 1) {
break;
}
// evaluate the drafted token on the draft model
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
++n_past_cur;
if (grammar_dft != NULL) {
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
}
}
// evaluate the target model on the drafted tokens
{
llama_kv_cache_seq_keep(ctx_tgt, 0);
for (int s = 1; s < n_seq_dft; ++s) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
++n_past_tgt;
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
// the first token is always proposed by the traget model before the speculation loop so we erase it here
for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {
continue;
}
drafts[s].tokens.erase(drafts[s].tokens.begin());
}
// the first token is always proposed by the traget model before the speculation loop
drafted.erase(drafted.begin());
}
auto t_dec_end = ggml_time_us();
@@ -421,8 +288,9 @@ int main(int argc, char ** argv) {
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
// TODO: make sure these numbers are computed correctly
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
@@ -436,19 +304,16 @@ int main(int argc, char ** argv) {
LOG_TEE("\ntarget:\n");
llama_print_timings(ctx_tgt);
llama_sampling_free(ctx_sampling);
for (int s = 0; s < n_seq_dft; ++s) {
llama_sampling_free(drafts[s].ctx_sampling);
}
llama_batch_free(batch_dft);
llama_free(ctx_tgt);
llama_free_model(model_tgt);
llama_free(ctx_dft);
llama_free_model(model_dft);
if (grammar_dft != NULL) {
llama_grammar_free(grammar_dft);
llama_grammar_free(grammar_tgt);
}
llama_backend_free();
fprintf(stderr, "\n\n");

View File

@@ -9,7 +9,7 @@ import numpy as np
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
import gguf
# gguf constants

Some files were not shown because too many files have changed in this diff Show More