mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
23 Commits
b1570
...
ceb/libstd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5b74310e6e | ||
|
|
1d144112c0 | ||
|
|
f43f09366d | ||
|
|
d2809a3ba2 | ||
|
|
15f5d96037 | ||
|
|
33c9892af5 | ||
|
|
8efa0f6ebe | ||
|
|
524907aa76 | ||
|
|
3bd2c7ce1b | ||
|
|
bde629bb53 | ||
|
|
f7f9e06212 | ||
|
|
74daabae69 | ||
|
|
b18c66ca6e | ||
|
|
f4d973cecb | ||
|
|
954e22858c | ||
|
|
e2bd725f4b | ||
|
|
1f5cd83275 | ||
|
|
4fea3420ee | ||
|
|
64e64aa255 | ||
|
|
8406b0924b | ||
|
|
b38a16dfcf | ||
|
|
0dab8cd7cc | ||
|
|
bb03290c17 |
@@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./quantize "$@"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
./main "$@"
|
||||
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
|
||||
./finetune "$@"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
@@ -34,6 +36,8 @@ else
|
||||
echo " ex: --outtype f16 \"/models/7B/\" "
|
||||
echo " --quantize (-q): Optimize with quantization process ggml"
|
||||
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
|
||||
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
|
||||
echo " See documentation for finetune for command-line parameters"
|
||||
echo " --all-in-one (-a): Execute --convert & --quantize"
|
||||
echo " ex: \"/models/\" 7B"
|
||||
echo " --server (-s): Run a model on the server"
|
||||
|
||||
11
.github/workflows/build.yml
vendored
11
.github/workflows/build.yml
vendored
@@ -498,6 +498,17 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
||||
25
.gitignore
vendored
25
.gitignore
vendored
@@ -88,15 +88,16 @@ poetry.lock
|
||||
poetry.toml
|
||||
|
||||
# Test binaries
|
||||
tests/test-grammar-parser
|
||||
tests/test-llama-grammar
|
||||
tests/test-double-float
|
||||
tests/test-grad0
|
||||
tests/test-opt
|
||||
tests/test-quantize-fns
|
||||
tests/test-quantize-perf
|
||||
tests/test-sampling
|
||||
tests/test-tokenizer-0-llama
|
||||
tests/test-tokenizer-0-falcon
|
||||
tests/test-tokenizer-1-llama
|
||||
tests/test-tokenizer-1-bpe
|
||||
/tests/test-grammar-parser
|
||||
/tests/test-llama-grammar
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
/tests/test-opt
|
||||
/tests/test-quantize-fns
|
||||
/tests/test-quantize-perf
|
||||
/tests/test-sampling
|
||||
/tests/test-tokenizer-0-llama
|
||||
/tests/test-tokenizer-0-falcon
|
||||
/tests/test-tokenizer-1-llama
|
||||
/tests/test-tokenizer-1-bpe
|
||||
/tests/test-rope
|
||||
|
||||
@@ -43,6 +43,7 @@ else()
|
||||
endif()
|
||||
|
||||
# general
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||
@@ -100,6 +101,9 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALO
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
@@ -112,6 +116,11 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
# enable libstdc++ assertions for debug builds
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
add_compile_options(-fsanitize=thread)
|
||||
@@ -161,7 +170,7 @@ if (LLAMA_METAL)
|
||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||
${FOUNDATION_LIBRARY}
|
||||
|
||||
29
Makefile
29
Makefile
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
|
||||
TEST_TARGETS = \
|
||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
@@ -30,7 +30,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))'
|
||||
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
else
|
||||
CC_IS_CLANG=1
|
||||
ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
|
||||
ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
|
||||
CC_IS_LLVM_CLANG=1
|
||||
else
|
||||
CC_IS_APPLE_CLANG=1
|
||||
@@ -174,6 +174,10 @@ ifdef LLAMA_DEBUG
|
||||
MK_CFLAGS += -O0 -g
|
||||
MK_CXXFLAGS += -O0 -g
|
||||
MK_LDFLAGS += -g
|
||||
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
|
||||
endif
|
||||
else
|
||||
MK_CPPFLAGS += -DNDEBUG
|
||||
endif
|
||||
@@ -648,7 +652,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
|
||||
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
@@ -701,28 +705,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
@@ -737,5 +741,8 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
|
||||
|
||||
@@ -116,6 +116,8 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [nat/openplayground](https://github.com/nat/openplayground)
|
||||
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
|
||||
- [withcatai/catai](https://github.com/withcatai/catai)
|
||||
- [semperai/amica](https://github.com/semperai/amica)
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
|
||||
---
|
||||
|
||||
@@ -322,7 +324,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
|
||||
|
||||
### BLAS Build
|
||||
|
||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
|
||||
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
|
||||
|
||||
- #### Accelerate Framework:
|
||||
|
||||
@@ -894,7 +896,7 @@ Additionally, there the following images, similar to the above:
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||
|
||||
#### Usage
|
||||
|
||||
|
||||
@@ -11,7 +11,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
|
||||
if(NOT IS_DIRECTORY "${GIT_DIR}")
|
||||
file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
|
||||
string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||
string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
|
||||
if (SLASH_POS EQUAL 0)
|
||||
set(GIT_DIR "${REAL_GIT_DIR}")
|
||||
else()
|
||||
set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(GIT_INDEX "${GIT_DIR}/index")
|
||||
@@ -26,7 +31,7 @@ add_custom_command(
|
||||
COMMENT "Generating build details from Git"
|
||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
|
||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
|
||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||
VERBATIM
|
||||
|
||||
@@ -267,7 +267,7 @@ class Params:
|
||||
n_ctx = 2048
|
||||
|
||||
return Params(
|
||||
n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
|
||||
n_vocab = model["tok_embeddings.weight"].shape[0],
|
||||
n_embd = config["dim"],
|
||||
n_layer = config["n_layers"],
|
||||
n_ctx = n_ctx,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
This is a swift clone of `examples/batched`.
|
||||
|
||||
$ `make`
|
||||
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||
$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
|
||||
|
||||
1
examples/llama.swiftui/.gitignore
vendored
Normal file
1
examples/llama.swiftui/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
xcuserdata
|
||||
7
examples/llama.swiftui/README.md
Normal file
7
examples/llama.swiftui/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# llama.swiftui
|
||||
|
||||
Local inference of llama.cpp on an iPhone.
|
||||
So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
|
||||
|
||||
https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
|
||||
|
||||
176
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Normal file
176
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
Normal file
@@ -0,0 +1,176 @@
|
||||
import Foundation
|
||||
|
||||
// import llama
|
||||
|
||||
enum LlamaError: Error {
|
||||
case couldNotInitializeContext
|
||||
}
|
||||
|
||||
actor LlamaContext {
|
||||
private var model: OpaquePointer
|
||||
private var context: OpaquePointer
|
||||
private var batch: llama_batch
|
||||
private var tokens_list: [llama_token]
|
||||
|
||||
var n_len: Int32 = 512
|
||||
var n_cur: Int32 = 0
|
||||
var n_decode: Int32 = 0
|
||||
|
||||
init(model: OpaquePointer, context: OpaquePointer) {
|
||||
self.model = model
|
||||
self.context = context
|
||||
self.tokens_list = []
|
||||
self.batch = llama_batch_init(512, 0, 1)
|
||||
}
|
||||
|
||||
deinit {
|
||||
llama_free(context)
|
||||
llama_free_model(model)
|
||||
llama_backend_free()
|
||||
}
|
||||
|
||||
static func createContext(path: String) throws -> LlamaContext {
|
||||
llama_backend_init(false)
|
||||
let model_params = llama_model_default_params()
|
||||
|
||||
let model = llama_load_model_from_file(path, model_params)
|
||||
guard let model else {
|
||||
print("Could not load model at \(path)")
|
||||
throw LlamaError.couldNotInitializeContext
|
||||
}
|
||||
var ctx_params = llama_context_default_params()
|
||||
ctx_params.seed = 1234
|
||||
ctx_params.n_ctx = 2048
|
||||
ctx_params.n_threads = 8
|
||||
ctx_params.n_threads_batch = 8
|
||||
|
||||
let context = llama_new_context_with_model(model, ctx_params)
|
||||
guard let context else {
|
||||
print("Could not load context!")
|
||||
throw LlamaError.couldNotInitializeContext
|
||||
}
|
||||
|
||||
return LlamaContext(model: model, context: context)
|
||||
}
|
||||
|
||||
func get_n_tokens() -> Int32 {
|
||||
return batch.n_tokens;
|
||||
}
|
||||
|
||||
func completion_init(text: String) {
|
||||
print("attempting to complete \"\(text)\"")
|
||||
|
||||
tokens_list = tokenize(text: text, add_bos: true)
|
||||
|
||||
let n_ctx = llama_n_ctx(context)
|
||||
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
|
||||
|
||||
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
|
||||
|
||||
if n_kv_req > n_ctx {
|
||||
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
|
||||
}
|
||||
|
||||
for id in tokens_list {
|
||||
print(token_to_piece(token: id))
|
||||
}
|
||||
|
||||
// batch = llama_batch_init(512, 0) // done in init()
|
||||
batch.n_tokens = Int32(tokens_list.count)
|
||||
|
||||
for i1 in 0..<batch.n_tokens {
|
||||
let i = Int(i1)
|
||||
batch.token[i] = tokens_list[i]
|
||||
batch.pos[i] = i1
|
||||
batch.n_seq_id[Int(i)] = 1
|
||||
batch.seq_id[Int(i)]![0] = 0
|
||||
batch.logits[i] = 0
|
||||
}
|
||||
batch.logits[Int(batch.n_tokens) - 1] = 1 // true
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed")
|
||||
}
|
||||
|
||||
n_cur = batch.n_tokens
|
||||
}
|
||||
|
||||
func completion_loop() -> String {
|
||||
var new_token_id: llama_token = 0
|
||||
|
||||
let n_vocab = llama_n_vocab(model)
|
||||
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
|
||||
|
||||
var candidates = Array<llama_token_data>()
|
||||
candidates.reserveCapacity(Int(n_vocab))
|
||||
|
||||
for token_id in 0..<n_vocab {
|
||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||
}
|
||||
candidates.withUnsafeMutableBufferPointer() { buffer in
|
||||
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
|
||||
|
||||
new_token_id = llama_sample_token_greedy(context, &candidates_p)
|
||||
}
|
||||
|
||||
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||
print("\n")
|
||||
return ""
|
||||
}
|
||||
|
||||
let new_token_str = token_to_piece(token: new_token_id)
|
||||
print(new_token_str)
|
||||
// tokens_list.append(new_token_id)
|
||||
|
||||
batch.n_tokens = 0
|
||||
|
||||
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||
batch.n_seq_id[Int(batch.n_tokens)] = 1
|
||||
batch.seq_id[Int(batch.n_tokens)]![0] = 0
|
||||
batch.logits[Int(batch.n_tokens)] = 1 // true
|
||||
batch.n_tokens += 1
|
||||
|
||||
n_decode += 1
|
||||
|
||||
n_cur += 1
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("failed to evaluate llama!")
|
||||
}
|
||||
|
||||
return new_token_str
|
||||
}
|
||||
|
||||
func clear() {
|
||||
tokens_list.removeAll()
|
||||
}
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
|
||||
|
||||
var swiftTokens: [llama_token] = []
|
||||
for i in 0..<tokenCount {
|
||||
swiftTokens.append(tokens[Int(i)])
|
||||
}
|
||||
|
||||
tokens.deallocate()
|
||||
|
||||
return swiftTokens
|
||||
}
|
||||
|
||||
private func token_to_piece(token: llama_token) -> String {
|
||||
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
|
||||
result.initialize(repeating: Int8(0), count: 8)
|
||||
|
||||
let _ = llama_token_to_piece(model, token, result, 8)
|
||||
|
||||
let resultStr = String(cString: result)
|
||||
|
||||
result.deallocate()
|
||||
|
||||
return resultStr
|
||||
}
|
||||
}
|
||||
5
examples/llama.swiftui/llama.cpp.swift/bridging-header.h
Normal file
5
examples/llama.swiftui/llama.cpp.swift/bridging-header.h
Normal file
@@ -0,0 +1,5 @@
|
||||
//
|
||||
// Use this file to import your target's public headers that you would like to expose to Swift.
|
||||
//
|
||||
|
||||
#import "llama.h"
|
||||
481
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
Normal file
481
examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
Normal file
@@ -0,0 +1,481 @@
|
||||
// !$*UTF8*$!
|
||||
{
|
||||
archiveVersion = 1;
|
||||
classes = {
|
||||
};
|
||||
objectVersion = 56;
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
|
||||
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
|
||||
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */ = {
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
|
||||
8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXFrameworksBuildPhase section */
|
||||
|
||||
/* Begin PBXGroup section */
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
|
||||
542376092B0D9C40008E6A1C /* ggml-backend.h */,
|
||||
542376062B0D9BEA008E6A1C /* ggml-quants.h */,
|
||||
542376072B0D9BFB008E6A1C /* ggml-quants.c */,
|
||||
549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
|
||||
549479C62AC9E0F200E0F78B /* ggml-metal.h */,
|
||||
549479C52AC9E0F200E0F78B /* ggml-metal.m */,
|
||||
542EA09B2AC8723900A8AEE9 /* ggml.c */,
|
||||
542EA09C2AC8723900A8AEE9 /* ggml.h */,
|
||||
542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
|
||||
542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
|
||||
542EA0A12AC8729100A8AEE9 /* llama.cpp */,
|
||||
542EA0A22AC8729100A8AEE9 /* llama.h */,
|
||||
);
|
||||
name = llama.cpp;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C836A2AC328BD0096AF73 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */,
|
||||
8A3F84232AC4C891005E2EE8 /* models */,
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */,
|
||||
8A1C83742AC328BD0096AF73 /* Products */,
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */,
|
||||
);
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83742AC328BD0096AF73 /* Products */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
|
||||
);
|
||||
name = Products;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */,
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */,
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */,
|
||||
8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
|
||||
8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */,
|
||||
);
|
||||
path = llama.swiftui;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
|
||||
);
|
||||
path = "Preview Content";
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A39BE082AC7601000BFEB40 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
549479CA2AC9E16000E0F78B /* Metal.framework */,
|
||||
8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
|
||||
);
|
||||
name = Frameworks;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84102AC4BD85005E2EE8 /* Resources */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */,
|
||||
);
|
||||
path = Resources;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A3F84112AC4BD8C005E2EE8 /* models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
|
||||
);
|
||||
path = models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
|
||||
8A907F322AC7134E006146EA /* LibLlama.swift */,
|
||||
);
|
||||
path = llama.cpp.swift;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4A2AC332BF008AE1EA /* UI */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
|
||||
);
|
||||
path = UI;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
8A9F7C4B2AC332DC008AE1EA /* Models */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
|
||||
);
|
||||
path = Models;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
/* End PBXGroup section */
|
||||
|
||||
/* Begin PBXNativeTarget section */
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
|
||||
isa = PBXNativeTarget;
|
||||
buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
|
||||
buildPhases = (
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */,
|
||||
8A1C83702AC328BD0096AF73 /* Frameworks */,
|
||||
8A1C83712AC328BD0096AF73 /* Resources */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
dependencies = (
|
||||
);
|
||||
name = llama.swiftui;
|
||||
packageProductDependencies = (
|
||||
);
|
||||
productName = llama.swiftui;
|
||||
productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
|
||||
productType = "com.apple.product-type.application";
|
||||
};
|
||||
/* End PBXNativeTarget section */
|
||||
|
||||
/* Begin PBXProject section */
|
||||
8A1C836B2AC328BD0096AF73 /* Project object */ = {
|
||||
isa = PBXProject;
|
||||
attributes = {
|
||||
BuildIndependentTargetsInParallel = 1;
|
||||
LastSwiftUpdateCheck = 1500;
|
||||
LastUpgradeCheck = 1500;
|
||||
TargetAttributes = {
|
||||
8A1C83722AC328BD0096AF73 = {
|
||||
CreatedOnToolsVersion = 15.0;
|
||||
LastSwiftMigration = 1500;
|
||||
};
|
||||
};
|
||||
};
|
||||
buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
|
||||
compatibilityVersion = "Xcode 14.0";
|
||||
developmentRegion = en;
|
||||
hasScannedForEncodings = 0;
|
||||
knownRegions = (
|
||||
en,
|
||||
Base,
|
||||
);
|
||||
mainGroup = 8A1C836A2AC328BD0096AF73;
|
||||
packageReferences = (
|
||||
);
|
||||
productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
|
||||
projectDirPath = "";
|
||||
projectRoot = "";
|
||||
targets = (
|
||||
8A1C83722AC328BD0096AF73 /* llama.swiftui */,
|
||||
);
|
||||
};
|
||||
/* End PBXProject section */
|
||||
|
||||
/* Begin PBXResourcesBuildPhase section */
|
||||
8A1C83712AC328BD0096AF73 /* Resources */ = {
|
||||
isa = PBXResourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
|
||||
8A3F84242AC4C891005E2EE8 /* models in Resources */,
|
||||
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
|
||||
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXResourcesBuildPhase section */
|
||||
|
||||
/* Begin PBXSourcesBuildPhase section */
|
||||
8A1C836F2AC328BD0096AF73 /* Sources */ = {
|
||||
isa = PBXSourcesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
|
||||
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
|
||||
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
|
||||
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
|
||||
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
|
||||
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
|
||||
8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
|
||||
8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
|
||||
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
|
||||
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXSourcesBuildPhase section */
|
||||
|
||||
/* Begin XCBuildConfiguration section */
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = dwarf;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_TESTABILITY = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_DYNAMIC_NO_PIC = NO;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_OPTIMIZATION_LEVEL = 0;
|
||||
GCC_PREPROCESSOR_DEFINITIONS = (
|
||||
"DEBUG=1",
|
||||
"$(inherited)",
|
||||
);
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
ONLY_ACTIVE_ARCH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83802AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ALWAYS_SEARCH_USER_PATHS = NO;
|
||||
ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
|
||||
CLANG_ANALYZER_NONNULL = YES;
|
||||
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
|
||||
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CLANG_ENABLE_OBJC_ARC = YES;
|
||||
CLANG_ENABLE_OBJC_WEAK = YES;
|
||||
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
|
||||
CLANG_WARN_BOOL_CONVERSION = YES;
|
||||
CLANG_WARN_COMMA = YES;
|
||||
CLANG_WARN_CONSTANT_CONVERSION = YES;
|
||||
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
|
||||
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
|
||||
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
|
||||
CLANG_WARN_EMPTY_BODY = YES;
|
||||
CLANG_WARN_ENUM_CONVERSION = YES;
|
||||
CLANG_WARN_INFINITE_RECURSION = YES;
|
||||
CLANG_WARN_INT_CONVERSION = YES;
|
||||
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
|
||||
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
|
||||
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
|
||||
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
|
||||
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
|
||||
CLANG_WARN_STRICT_PROTOTYPES = YES;
|
||||
CLANG_WARN_SUSPICIOUS_MOVE = YES;
|
||||
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
|
||||
CLANG_WARN_UNREACHABLE_CODE = YES;
|
||||
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
|
||||
COPY_PHASE_STRIP = NO;
|
||||
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
|
||||
ENABLE_NS_ASSERTIONS = NO;
|
||||
ENABLE_STRICT_OBJC_MSGSEND = YES;
|
||||
ENABLE_USER_SCRIPT_SANDBOXING = YES;
|
||||
GCC_C_LANGUAGE_STANDARD = gnu17;
|
||||
GCC_NO_COMMON_BLOCKS = YES;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
|
||||
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
|
||||
GCC_WARN_UNDECLARED_SELECTOR = YES;
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 17.0;
|
||||
LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
MTL_FAST_MATH = YES;
|
||||
SDKROOT = iphoneos;
|
||||
SWIFT_COMPILATION_MODE = wholemodule;
|
||||
VALIDATE_PRODUCT = YES;
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
8A1C83822AC328BE0096AF73 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
8A1C83832AC328BE0096AF73 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CLANG_ENABLE_MODULES = YES;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
|
||||
DEVELOPMENT_TEAM = STLSG3FG8Q;
|
||||
ENABLE_PREVIEWS = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchScreen_Generation = YES;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
|
||||
SWIFT_VERSION = 5.0;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
/* End XCBuildConfiguration section */
|
||||
|
||||
/* Begin XCConfigurationList section */
|
||||
8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C837F2AC328BE0096AF73 /* Debug */,
|
||||
8A1C83802AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
8A1C83822AC328BE0096AF73 /* Debug */,
|
||||
8A1C83832AC328BE0096AF73 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
};
|
||||
rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
|
||||
}
|
||||
7
examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
7
examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
generated
Normal file
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Workspace
|
||||
version = "1.0">
|
||||
<FileRef
|
||||
location = "self:">
|
||||
</FileRef>
|
||||
</Workspace>
|
||||
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>IDEDidComputeMac32BitWarning</key>
|
||||
<true/>
|
||||
</dict>
|
||||
</plist>
|
||||
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"colors" : [
|
||||
{
|
||||
"idiom" : "universal"
|
||||
}
|
||||
],
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"images" : [
|
||||
{
|
||||
"idiom" : "universal",
|
||||
"platform" : "ios",
|
||||
"size" : "1024x1024"
|
||||
}
|
||||
],
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
45
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Normal file
45
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
Normal file
@@ -0,0 +1,45 @@
|
||||
import Foundation
|
||||
|
||||
@MainActor
|
||||
class LlamaState: ObservableObject {
|
||||
@Published var messageLog = ""
|
||||
|
||||
private var llamaContext: LlamaContext?
|
||||
private var modelUrl: URL? {
|
||||
Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
|
||||
// Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
|
||||
}
|
||||
init() {
|
||||
do {
|
||||
try loadModel()
|
||||
} catch {
|
||||
messageLog += "Error!\n"
|
||||
}
|
||||
}
|
||||
|
||||
private func loadModel() throws {
|
||||
messageLog += "Loading model...\n"
|
||||
if let modelUrl {
|
||||
llamaContext = try LlamaContext.createContext(path: modelUrl.path())
|
||||
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||
} else {
|
||||
messageLog += "Could not locate model\n"
|
||||
}
|
||||
}
|
||||
|
||||
func complete(text: String) async {
|
||||
guard let llamaContext else {
|
||||
return
|
||||
}
|
||||
messageLog += "Attempting to complete text...\n"
|
||||
await llamaContext.completion_init(text: text)
|
||||
messageLog += "\(text)"
|
||||
|
||||
while await llamaContext.n_cur <= llamaContext.n_len {
|
||||
let result = await llamaContext.completion_loop()
|
||||
messageLog += "\(result)"
|
||||
}
|
||||
await llamaContext.clear()
|
||||
messageLog += "\n\ndone\n"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"info" : {
|
||||
"author" : "xcode",
|
||||
"version" : 1
|
||||
}
|
||||
}
|
||||
0
examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
vendored
Normal file
0
examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
vendored
Normal file
42
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
Normal file
42
examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
Normal file
@@ -0,0 +1,42 @@
|
||||
import SwiftUI
|
||||
|
||||
struct ContentView: View {
|
||||
@StateObject var llamaState = LlamaState()
|
||||
|
||||
@State private var multiLineText = ""
|
||||
|
||||
var body: some View {
|
||||
VStack {
|
||||
ScrollView(.vertical) {
|
||||
Text(llamaState.messageLog)
|
||||
}
|
||||
|
||||
TextEditor(text: $multiLineText)
|
||||
.frame(height: 200)
|
||||
.padding()
|
||||
.border(Color.gray, width: 0.5)
|
||||
Button(action: {
|
||||
sendText()
|
||||
}) {
|
||||
Text("Send")
|
||||
.padding()
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
}
|
||||
}
|
||||
.padding()
|
||||
}
|
||||
|
||||
func sendText() {
|
||||
Task {
|
||||
await llamaState.complete(text: multiLineText)
|
||||
multiLineText = ""
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
#Preview {
|
||||
ContentView()
|
||||
}
|
||||
*/
|
||||
10
examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
Normal file
10
examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
Normal file
@@ -0,0 +1,10 @@
|
||||
import SwiftUI
|
||||
|
||||
@main
|
||||
struct llama_swiftuiApp: App {
|
||||
var body: some Scene {
|
||||
WindowGroup {
|
||||
ContentView()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5,7 +5,7 @@ import json
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers import CLIPModel, CLIPProcessor
|
||||
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
|
||||
|
||||
TEXT = "clip.text"
|
||||
VISION = "clip.vision"
|
||||
@@ -78,11 +78,19 @@ ap.add_argument("--text-only", action="store_true", required=False,
|
||||
help="Save a text-only model. It can't be used to encode images")
|
||||
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||
help="Save a vision-only model. It can't be used to encode texts")
|
||||
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
|
||||
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
|
||||
default_image_std = [0.26862954, 0.26130258, 0.27577711]
|
||||
ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||
ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
|
||||
|
||||
# with proper
|
||||
args = ap.parse_args()
|
||||
|
||||
|
||||
@@ -96,15 +104,22 @@ if args.use_f32:
|
||||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
|
||||
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||
vocab = json.load(f)
|
||||
tokens = [key for key in vocab]
|
||||
if args.clip_model_is_vision:
|
||||
vocab = None
|
||||
tokens = None
|
||||
else:
|
||||
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||
vocab = json.load(f)
|
||||
tokens = [key for key in vocab]
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
v_hparams = config["vision_config"]
|
||||
t_hparams = config["text_config"]
|
||||
if args.clip_model_is_vision:
|
||||
v_hparams = config
|
||||
t_hparams = None
|
||||
else:
|
||||
v_hparams = config["vision_config"]
|
||||
t_hparams = config["text_config"]
|
||||
|
||||
# possible data types
|
||||
# ftype == 0 -> float32
|
||||
@@ -117,9 +132,12 @@ ftype = 1
|
||||
if args.use_f32:
|
||||
ftype = 0
|
||||
|
||||
|
||||
model = CLIPModel.from_pretrained(dir_model)
|
||||
processor = CLIPProcessor.from_pretrained(dir_model)
|
||||
if args.clip_model_is_vision:
|
||||
model = CLIPVisionModel.from_pretrained(dir_model)
|
||||
processor = None
|
||||
else:
|
||||
model = CLIPModel.from_pretrained(dir_model)
|
||||
processor = CLIPProcessor.from_pretrained(dir_model)
|
||||
|
||||
fname_middle = None
|
||||
has_text_encoder = True
|
||||
@@ -128,13 +146,13 @@ has_llava_projector = False
|
||||
if args.text_only:
|
||||
fname_middle = "text-"
|
||||
has_vision_encoder = False
|
||||
elif args.vision_only:
|
||||
fname_middle = "vision-"
|
||||
has_text_encoder = False
|
||||
elif args.llava_projector is not None:
|
||||
fname_middle = "mmproj-"
|
||||
has_text_encoder = False
|
||||
has_llava_projector = True
|
||||
elif args.vision_only:
|
||||
fname_middle = "vision-"
|
||||
has_text_encoder = False
|
||||
else:
|
||||
fname_middle = ""
|
||||
|
||||
@@ -182,8 +200,12 @@ if has_vision_encoder:
|
||||
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
||||
|
||||
image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
|
||||
image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
|
||||
if processor is not None:
|
||||
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
||||
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
||||
else:
|
||||
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||
fout.add_array("clip.vision.image_mean", image_mean)
|
||||
fout.add_array("clip.vision.image_std", image_std)
|
||||
|
||||
|
||||
7
examples/lookahead/README.md
Normal file
7
examples/lookahead/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# llama.cpp/examples/lookahead
|
||||
|
||||
Demonstartion of lookahead decoding technique:
|
||||
|
||||
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||
|
||||
More info: https://github.com/ggerganov/llama.cpp/pull/4207
|
||||
@@ -100,6 +100,12 @@ static void sigint_handler(int signo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
@@ -113,6 +119,7 @@ int main(int argc, char ** argv) {
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// TODO: Dump params ?
|
||||
|
||||
@@ -11,10 +11,10 @@ app = Flask(__name__)
|
||||
slot_id = -1
|
||||
|
||||
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
|
||||
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
|
||||
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
|
||||
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
|
||||
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
|
||||
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
|
||||
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
|
||||
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
|
||||
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
|
||||
parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
|
||||
parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
|
||||
parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
|
||||
@@ -34,19 +34,19 @@ def is_present(json, key):
|
||||
|
||||
#convert chat to prompt
|
||||
def convert_chat(messages):
|
||||
prompt = "" + args.chat_prompt.replace("\\n", "\n")
|
||||
|
||||
system_n = args.system_name.replace("\\n", "\n")
|
||||
user_n = args.user_name.replace("\\n", "\n")
|
||||
ai_n = args.ai_name.replace("\\n", "\n")
|
||||
stop = args.stop.replace("\\n", "\n")
|
||||
system_n = args.system_name
|
||||
user_n = args.user_name
|
||||
ai_n = args.ai_name
|
||||
stop = args.stop
|
||||
|
||||
prompt = "" + args.chat_prompt + stop
|
||||
|
||||
for line in messages:
|
||||
if (line["role"] == "system"):
|
||||
prompt += f"{system_n}{line['content']}"
|
||||
prompt += f"{system_n}{line['content']}{stop}"
|
||||
if (line["role"] == "user"):
|
||||
prompt += f"{user_n}{line['content']}"
|
||||
prompt += f"{user_n}{line['content']}{stop}"
|
||||
if (line["role"] == "assistant"):
|
||||
prompt += f"{ai_n}{line['content']}{stop}"
|
||||
prompt += ai_n.rstrip()
|
||||
@@ -130,7 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
|
||||
}
|
||||
]
|
||||
}
|
||||
slot_id = data["slot_id"]
|
||||
slot_id = data.get("slot_id")
|
||||
if (chat):
|
||||
if (start):
|
||||
resData["choices"][0]["delta"] = {
|
||||
@@ -150,11 +150,13 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
|
||||
return resData
|
||||
|
||||
|
||||
@app.route('/chat/completions', methods=['POST'])
|
||||
@app.route('/v1/chat/completions', methods=['POST'])
|
||||
@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
|
||||
@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
|
||||
def chat_completions():
|
||||
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
|
||||
return Response(status=403)
|
||||
if request.method == 'OPTIONS':
|
||||
return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||
body = request.get_json()
|
||||
stream = False
|
||||
tokenize = False
|
||||
@@ -177,20 +179,22 @@ def chat_completions():
|
||||
data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
|
||||
time_now = int(time.time())
|
||||
resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
|
||||
yield 'data: {}\n'.format(json.dumps(resData))
|
||||
yield 'data: {}\n\n'.format(json.dumps(resData))
|
||||
for line in data.iter_lines():
|
||||
if line:
|
||||
decoded_line = line.decode('utf-8')
|
||||
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
|
||||
yield 'data: {}\n'.format(json.dumps(resData))
|
||||
return Response(generate(), mimetype='text/event-stream')
|
||||
yield 'data: {}\n\n'.format(json.dumps(resData))
|
||||
return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||
|
||||
|
||||
@app.route('/completions', methods=['POST'])
|
||||
@app.route('/v1/completions', methods=['POST'])
|
||||
@app.route('/completions', methods=['POST', 'OPTIONS'])
|
||||
@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
|
||||
def completion():
|
||||
if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
|
||||
return Response(status=403)
|
||||
if request.method == 'OPTIONS':
|
||||
return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||
body = request.get_json()
|
||||
stream = False
|
||||
tokenize = False
|
||||
@@ -216,8 +220,8 @@ def completion():
|
||||
if line:
|
||||
decoded_line = line.decode('utf-8')
|
||||
resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
|
||||
yield 'data: {}\n'.format(json.dumps(resData))
|
||||
return Response(generate(), mimetype='text/event-stream')
|
||||
yield 'data: {}\n\n'.format(json.dumps(resData))
|
||||
return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(args.host, port=args.port)
|
||||
|
||||
@@ -155,15 +155,23 @@ struct task_server {
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
int multitask_id = -1;
|
||||
};
|
||||
|
||||
struct task_result {
|
||||
int id;
|
||||
int multitask_id = -1;
|
||||
bool stop;
|
||||
bool error;
|
||||
json result_json;
|
||||
};
|
||||
|
||||
struct task_multi {
|
||||
int id;
|
||||
std::set<int> subtasks_remaining{};
|
||||
std::vector<task_result> results{};
|
||||
};
|
||||
|
||||
// TODO: can become bool if we can't find use of more states
|
||||
enum slot_state
|
||||
{
|
||||
@@ -406,6 +414,9 @@ struct llama_client_slot
|
||||
double t_prompt_processing; // ms
|
||||
double t_token_generation; // ms
|
||||
|
||||
// multitasks
|
||||
int multitask_id = -1;
|
||||
|
||||
void reset() {
|
||||
num_prompt_tokens = 0;
|
||||
generated_text = "";
|
||||
@@ -529,7 +540,8 @@ struct llama_server_context
|
||||
|
||||
std::vector<task_server> queue_tasks;
|
||||
std::vector<task_result> queue_results;
|
||||
std::mutex mutex_tasks;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||
std::mutex mutex_results;
|
||||
|
||||
~llama_server_context()
|
||||
@@ -1112,17 +1124,40 @@ struct llama_server_context
|
||||
return slot.images.size() > 0;
|
||||
}
|
||||
|
||||
void send_error(int id, std::string error)
|
||||
void send_error(task_server& task, std::string error)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = id;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
res.stop = false;
|
||||
res.error = true;
|
||||
res.result_json = { { "content", error } };
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
void add_multi_task(int id, std::vector<int>& sub_ids)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
task_multi multi;
|
||||
multi.id = id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
}
|
||||
|
||||
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
for (auto& multitask : queue_multitasks)
|
||||
{
|
||||
if (multitask.id == multitask_id)
|
||||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json get_model_props()
|
||||
{
|
||||
return get_formated_generation(slots[0]);
|
||||
@@ -1167,6 +1202,7 @@ struct llama_server_context
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = false;
|
||||
|
||||
@@ -1206,6 +1242,7 @@ struct llama_server_context
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
@@ -1251,6 +1288,12 @@ struct llama_server_context
|
||||
res.result_json["model"] = slot.oaicompat_model;
|
||||
}
|
||||
|
||||
// parent multitask, if any, needs to be updated
|
||||
if (slot.multitask_id != -1)
|
||||
{
|
||||
update_multi_task(slot.multitask_id, slot.task_id, res);
|
||||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
@@ -1259,6 +1302,7 @@ struct llama_server_context
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
res.error = false;
|
||||
res.stop = true;
|
||||
|
||||
@@ -1285,9 +1329,9 @@ struct llama_server_context
|
||||
queue_results.push_back(res);
|
||||
}
|
||||
|
||||
int request_completion(json data, bool infill, bool embedding)
|
||||
int request_completion(json data, bool infill, bool embedding, int multitask_id)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
task_server task;
|
||||
task.id = id_gen++;
|
||||
task.target_id = 0;
|
||||
@@ -1295,6 +1339,16 @@ struct llama_server_context
|
||||
task.infill_mode = infill;
|
||||
task.embedding_mode = embedding;
|
||||
task.type = COMPLETION_TASK;
|
||||
task.multitask_id = multitask_id;
|
||||
|
||||
// when a completion task's prompt array is not a singleton, we split it into multiple requests
|
||||
if (task.data.at("prompt").size() > 1)
|
||||
{
|
||||
lock.unlock(); // entering new func scope
|
||||
return split_multiprompt_task(task);
|
||||
}
|
||||
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
queue_tasks.push_back(task);
|
||||
return task.id;
|
||||
}
|
||||
@@ -1313,8 +1367,17 @@ struct llama_server_context
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (queue_results[i].multitask_id == task_id)
|
||||
{
|
||||
update_multi_task(task_id, queue_results[i].id, queue_results[i]);
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (queue_results[i].id == task_id)
|
||||
{
|
||||
assert(queue_results[i].multitask_id == -1);
|
||||
task_result res = queue_results[i];
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
return res;
|
||||
@@ -1404,6 +1467,27 @@ struct llama_server_context
|
||||
queue_tasks.push_back(task);
|
||||
}
|
||||
|
||||
int split_multiprompt_task(task_server& multiprompt_task)
|
||||
{
|
||||
auto prompt_count = multiprompt_task.data.at("prompt").size();
|
||||
assert(prompt_count > 1);
|
||||
|
||||
int multitask_id = id_gen++;
|
||||
std::vector<int> subtask_ids(prompt_count);
|
||||
for (int i = 0; i < prompt_count; i++)
|
||||
{
|
||||
json subtask_data = multiprompt_task.data;
|
||||
subtask_data["prompt"] = subtask_data["prompt"][i];
|
||||
|
||||
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
||||
subtask_ids[i] = request_completion(subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
|
||||
}
|
||||
|
||||
// queue up the multitask so we can track its subtask progression
|
||||
add_multi_task(multitask_id, subtask_ids);
|
||||
return multitask_id;
|
||||
}
|
||||
|
||||
void process_tasks()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
@@ -1419,7 +1503,7 @@ struct llama_server_context
|
||||
{
|
||||
LOG_TEE("slot unavailable\n");
|
||||
// send error result
|
||||
send_error(task.id, "slot unavailable");
|
||||
send_error(task, "slot unavailable");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1433,11 +1517,12 @@ struct llama_server_context
|
||||
slot->infill = task.infill_mode;
|
||||
slot->embedding = task.embedding_mode;
|
||||
slot->task_id = task.id;
|
||||
slot->multitask_id = task.multitask_id;
|
||||
|
||||
if (!launch_slot_with_data(slot, task.data))
|
||||
{
|
||||
// send error result
|
||||
send_error(task.id, "internal_error");
|
||||
send_error(task, "internal_error");
|
||||
break;
|
||||
}
|
||||
} break;
|
||||
@@ -1453,6 +1538,38 @@ struct llama_server_context
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
|
||||
auto queue_iterator = queue_multitasks.begin();
|
||||
while (queue_iterator != queue_multitasks.end())
|
||||
{
|
||||
if (queue_iterator->subtasks_remaining.empty())
|
||||
{
|
||||
// all subtasks done == multitask is done
|
||||
task_result aggregate_result;
|
||||
aggregate_result.id = queue_iterator->id;
|
||||
aggregate_result.stop = true;
|
||||
aggregate_result.error = false;
|
||||
|
||||
// collect json results into one json result
|
||||
std::vector<json> result_jsons;
|
||||
for (auto& subres : queue_iterator->results)
|
||||
{
|
||||
result_jsons.push_back(subres.result_json);
|
||||
aggregate_result.error = aggregate_result.error && subres.error;
|
||||
}
|
||||
aggregate_result.result_json = json{ "results", result_jsons };
|
||||
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
queue_results.push_back(aggregate_result);
|
||||
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
else
|
||||
{
|
||||
++queue_iterator;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool update_slots() {
|
||||
@@ -1844,6 +1961,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -2198,6 +2316,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.mmproj = argv[i];
|
||||
}
|
||||
else if (arg == "--log-disable")
|
||||
{
|
||||
log_set_target(stdout);
|
||||
LOG_INFO("logging to file is disabled.", {});
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
@@ -2596,7 +2719,7 @@ int main(int argc, char **argv)
|
||||
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
json data = json::parse(req.body);
|
||||
const int task_id = llama.request_completion(data, false, false);
|
||||
const int task_id = llama.request_completion(data, false, false, -1);
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
@@ -2685,7 +2808,7 @@ int main(int argc, char **argv)
|
||||
{
|
||||
json data = oaicompat_completion_params_parse(json::parse(req.body));
|
||||
|
||||
const int task_id = llama.request_completion(data, false, false);
|
||||
const int task_id = llama.request_completion(data, false, false, -1);
|
||||
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
@@ -2754,7 +2877,7 @@ int main(int argc, char **argv)
|
||||
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
json data = json::parse(req.body);
|
||||
const int task_id = llama.request_completion(data, true, false);
|
||||
const int task_id = llama.request_completion(data, true, false, -1);
|
||||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
@@ -2858,7 +2981,7 @@ int main(int argc, char **argv)
|
||||
{
|
||||
prompt = "";
|
||||
}
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
|
||||
task_result result = llama.next_result(task_id);
|
||||
return res.set_content(result.result_json.dump(), "application/json");
|
||||
});
|
||||
|
||||
8
examples/speculative/README.md
Normal file
8
examples/speculative/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# llama.cpp/examples/speculative
|
||||
|
||||
Demonstartion of speculative decoding and tree-based speculative decoding techniques
|
||||
|
||||
More info:
|
||||
|
||||
- https://github.com/ggerganov/llama.cpp/pull/2926
|
||||
- https://github.com/ggerganov/llama.cpp/pull/3624
|
||||
@@ -1,20 +1,18 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-opencl.h"
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#include <clblast.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
2
ggml.c
2
ggml.c
@@ -9373,7 +9373,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
// TODO: find the optimal values for these
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
src0->type == GGML_TYPE_F32 &&
|
||||
//src0->type == GGML_TYPE_F32 &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
||||
|
||||
|
||||
5
ggml.h
5
ggml.h
@@ -244,11 +244,10 @@
|
||||
#define GGML_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
fflush(stderr); \
|
||||
fflush(stdout); \
|
||||
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
ggml_print_backtrace(); \
|
||||
exit(1); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
28
llama.cpp
28
llama.cpp
@@ -46,7 +46,6 @@
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h> // for _fseeki64
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
@@ -2645,15 +2644,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
}
|
||||
|
||||
// general kv
|
||||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
||||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
||||
|
||||
// special tokens
|
||||
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
}
|
||||
|
||||
static void llm_load_tensors(
|
||||
@@ -5550,18 +5549,8 @@ static int llama_decode_internal(
|
||||
n_threads = std::min(4, n_threads);
|
||||
}
|
||||
|
||||
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
||||
const bool full_offload_supported =
|
||||
model.arch == LLM_ARCH_LLAMA ||
|
||||
model.arch == LLM_ARCH_BAICHUAN ||
|
||||
model.arch == LLM_ARCH_FALCON ||
|
||||
model.arch == LLM_ARCH_REFACT ||
|
||||
model.arch == LLM_ARCH_MPT ||
|
||||
model.arch == LLM_ARCH_STARCODER ||
|
||||
model.arch == LLM_ARCH_STABLELM;
|
||||
|
||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
||||
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
||||
if (ggml_cpu_has_cublas() && fully_offloaded) {
|
||||
n_threads = 1;
|
||||
}
|
||||
|
||||
@@ -7037,6 +7026,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
||||
// Replace the data in candidates with the new_candidates data
|
||||
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
|
||||
candidates->size = new_candidates.size();
|
||||
candidates->sorted = false;
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||
set(BUILD_NUMBER 0)
|
||||
set(BUILD_COMMIT "unknown")
|
||||
set(BUILD_COMPILER "unknown")
|
||||
@@ -58,23 +56,3 @@ else()
|
||||
)
|
||||
set(BUILD_TARGET ${OUT})
|
||||
endif()
|
||||
|
||||
# Only write the build info if it changed
|
||||
if(EXISTS ${OUTPUT_FILE})
|
||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
|
||||
24
scripts/gen-build-info-cpp.cmake
Normal file
24
scripts/gen-build-info-cpp.cmake
Normal file
@@ -0,0 +1,24 @@
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
|
||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||
|
||||
# Only write the build info if it changed
|
||||
if(EXISTS ${OUTPUT_FILE})
|
||||
file(READ ${OUTPUT_FILE} CONTENTS)
|
||||
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
else()
|
||||
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
|
||||
endif()
|
||||
Reference in New Issue
Block a user