Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits: 28 commits, b2213 ... gg/py-mino
Commits in this comparison (SHA1):

- 56c047156a
- 4cb4d8b22d
- 3a03541ced
- 56d03d92be
- a46f50747b
- c5688c6250
- 4ef245a92a
- 973053d8b0
- 7c8bcc11dc
- 7fe4678b02
- ba2135ccae
- 89febfed93
- 5022cf242d
- 1ecea255eb
- a00a35cef9
- eccd7a26dd
- c14f72db9c
- cc6cac08e3
- 580111d42b
- 88c46cbdac
- a14679cc30
- 6560bed3f0
- 06bf2cf8c4
- 4ed8e4fbef
- 9c405c9f9a
- 5207b3fbc5
- 8dbbd75754
- c0a8c6db37
.github/workflows/nix-ci-aarch64.yml (vendored): 7 lines changed
@@ -19,7 +19,6 @@ on:
|
||||
|
||||
jobs:
|
||||
nix-build-aarch64:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
@@ -37,8 +36,8 @@ jobs:
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
@@ -46,7 +45,7 @@ jobs:
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
name: llama-cpp
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
|
||||
.github/workflows/nix-ci.yml (vendored): 11 lines changed
@@ -23,8 +23,8 @@ jobs:
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
@@ -37,7 +37,6 @@ jobs:
|
||||
--flake
|
||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -51,8 +50,8 @@ jobs:
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
@@ -60,7 +59,7 @@ jobs:
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
name: llama-cpp
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
|
||||
@@ -110,6 +110,7 @@ option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests"
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF)
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
@@ -201,6 +202,29 @@ if (LLAMA_METAL)
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
|
||||
if (LLAMA_METAL_EMBED_LIBRARY)
|
||||
enable_language(ASM)
|
||||
add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
|
||||
|
||||
set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
|
||||
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
|
||||
set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${EMBED_METALLIB_ASSEMBLY}
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
|
||||
COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||
COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||
COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||
COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||
COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
|
||||
DEPENDS ${METALLIB_SOURCE}
|
||||
COMMENT "Generate assembly for embedded Metal library"
|
||||
)
|
||||
|
||||
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||
|
||||
Makefile: 22 lines changed
@@ -173,7 +173,7 @@ ifdef LLAMA_DEBUG
|
||||
MK_LDFLAGS += -g
|
||||
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
|
||||
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
|
||||
endif
|
||||
else
|
||||
MK_CPPFLAGS += -DNDEBUG
|
||||
@@ -533,11 +533,29 @@ ifdef LLAMA_METAL
|
||||
ifdef LLAMA_METAL_NDEBUG
|
||||
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
||||
endif
|
||||
ifdef LLAMA_METAL_EMBED_LIBRARY
|
||||
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
||||
OBJS += ggml-metal-embed.o
|
||||
endif
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ifdef LLAMA_METAL_EMBED_LIBRARY
|
||||
ggml-metal-embed.o: ggml-metal.metal
|
||||
@echo "Embedding Metal library"
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
||||
@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
||||
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}
|
||||
endif
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_MPI
|
||||
@@ -701,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
|
||||
README.md: 13 lines changed
@@ -10,13 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Hot topics

- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
- [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats:
  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

----

@@ -107,6 +103,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)

**Multimodal models:**

@@ -145,6 +142,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)

@@ -156,6 +154,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)

---
@@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
||||
@@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
@@ -655,6 +655,8 @@ class OrionModel(Model):
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
# note: config provides rms norm but it is actually layer norm
|
||||
# ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
|
||||
|
||||
def write_tensors(self):
|
||||
@@ -1031,7 +1033,6 @@ class PersimmonModel(Model):
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
@@ -59,14 +59,39 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown

Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.

## LLaVA 1.6 gguf conversion

1) Backup your pth/safetensor model files as llava-surgery modifies them
2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models:
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
```console
python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json.
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision`
3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```

4) Create the visual gguf model:
```console
python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
5) Everything else as usual: convert.py the hf model, quantize as needed

5) Then convert the model to gguf format:
```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```

6) And finally we can run the llava-cli using the 1.6 model version:
```console
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```

**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
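For the "quantize as needed" step, the language-model gguf produced above can be quantized like any other model. A possible invocation, where the output filename and quantization type are illustrative rather than prescribed by this change:

```console
./quantize ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf ../llava-v1.6-vicuna-7b/ggml-model-q4_k_m.gguf Q4_K_M
```

The resulting file can then be passed to `llava-cli` via `-m` in place of the f16 model.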
@@ -65,9 +65,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
|
||||
for name in clip_tensors:
|
||||
del checkpoint[name]
|
||||
|
||||
# Save the updated checkpoint
|
||||
checkpoint_path = checkpoint_path
|
||||
save_model(checkpoint, checkpoint_path, file_type)
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -152,16 +150,6 @@ for name in first_mm_tensors:
|
||||
if len(projector) > 0:
|
||||
save_model(projector, f"{args.model}/llava.projector", 'pytorch')
|
||||
|
||||
for name in mm_tensors:
|
||||
del last_checkpoint[name]
|
||||
for name in first_mm_tensors:
|
||||
del first_checkpoint[name]
|
||||
|
||||
if len(mm_tensors) > 0:
|
||||
save_model(last_checkpoint, projector_checkpoint_path, file_type)
|
||||
if len(first_mm_tensors) > 0:
|
||||
save_model(first_checkpoint, newline_checkpoint_path, file_type)
|
||||
|
||||
print("Done!")
|
||||
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
|
||||
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
|
||||
|
||||
@@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||
if (!image_embd) {
|
||||
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
|
||||
|
||||
@@ -31,6 +31,8 @@ struct llava_image_embed {
|
||||
/** sanity check for clip <-> llava embed size match */
|
||||
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
||||
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||
|
||||
/** build an image embed from image file bytes */
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||
/** build an image embed from a path to an image filename */
|
||||
|
||||
@@ -334,6 +334,8 @@ int main(int argc, char ** argv) {
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
} else {
|
||||
params.n_keep += add_bos; // always keep the BOS token
|
||||
}
|
||||
|
||||
// prefix & suffix for instruct mode
|
||||
@@ -383,8 +385,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
if (params.n_keep > add_bos) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
@@ -540,14 +542,14 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_left = n_past - params.n_keep;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", },
    { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
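The `IQ4_NL` row added to this table is selectable by name like the existing presets; a hypothetical invocation (model paths are placeholders):

```console
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-iq4_nl.gguf IQ4_NL
```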
@@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

## Build

@@ -134,10 +135,13 @@ node index.js

## API Endpoints

- **GET** `/health`: Returns the current state of the server:
  - `{"status": "loading model"}` if the model is still being loaded.
  - `{"status": "error"}` if the model failed to load.
  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
  - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
  - 500 -> `{"status": "error"}` if the model failed to load.
  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.

  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
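A minimal health probe, assuming the default address used elsewhere in this diff (`127.0.0.1:8080`) and the query parameters documented above:

```console
curl -s "http://127.0.0.1:8080/health?fail_on_no_slot=1&include_slots=1"
```

A 200 response reports `slots_idle` and `slots_processing`; a 503 indicates the model is still loading or, with `fail_on_no_slot`, that no slot is currently free.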
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

@@ -147,7 +151,7 @@ node index.js

`temperature`: Adjust the randomness of the generated text (default: 0.8).

`dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
`dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

`dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).

@@ -205,7 +209,7 @@ node index.js

`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

@@ -238,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`.

- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
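To tie the parameters above together, a sketch of a `/completion` request against the default `127.0.0.1:8080` address (the prompt text is only a placeholder). With `temperature` 0.8 and `dynatemp_range` 0.3, the effective temperature is drawn from `[0.5; 1.1]`:

```console
curl -s http://127.0.0.1:8080/completion -d '{"prompt": "Building a website can be done in 10 simple steps:", "temperature": 0.8, "dynatemp_range": 0.3, "cache_prompt": true}'
```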
@@ -15,13 +15,11 @@
|
||||
using json = nlohmann::json;
|
||||
|
||||
inline static json oaicompat_completion_params_parse(
|
||||
const struct llama_model * model,
|
||||
const json &body, /* openai api json semantics */
|
||||
const std::string &chat_template)
|
||||
{
|
||||
json llama_params;
|
||||
std::string formatted_prompt = chat_template == "chatml"
|
||||
? format_chatml(body["messages"]) // OpenAI 'messages' to chatml (with <|im_start|>,...)
|
||||
: format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)
|
||||
|
||||
llama_params["__oaicompat"] = true;
|
||||
|
||||
@@ -34,7 +32,7 @@ inline static json oaicompat_completion_params_parse(
|
||||
// https://platform.openai.com/docs/api-reference/chat/create
|
||||
llama_sampling_params default_sparams;
|
||||
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
||||
llama_params["prompt"] = formatted_prompt;
|
||||
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "oai.hpp"
|
||||
|
||||
#include "../llava/clip.h"
|
||||
#include "../llava/llava.h"
|
||||
|
||||
#include "stb_image.h"
|
||||
|
||||
@@ -37,7 +38,7 @@ struct server_params
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::vector<std::string> api_keys;
|
||||
std::string public_path = "examples/server/public";
|
||||
std::string chat_template = "chatml";
|
||||
std::string chat_template = "";
|
||||
int32_t port = 8080;
|
||||
int32_t read_timeout = 600;
|
||||
int32_t write_timeout = 600;
|
||||
@@ -399,6 +400,16 @@ struct llama_server_context
|
||||
return true;
|
||||
}
|
||||
|
||||
void validate_model_chat_template(server_params & sparams) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
std::vector<char> buf(1);
|
||||
int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
|
||||
if (res < 0) {
|
||||
LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||
sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
|
||||
}
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
// create slots
|
||||
all_slots_are_idle = true;
|
||||
@@ -997,43 +1008,12 @@ struct llama_server_context
|
||||
{
|
||||
continue;
|
||||
}
|
||||
clip_image_f32_batch img_res_v;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
|
||||
{
|
||||
LOG_TEE("Error processing the given image");
|
||||
clip_free(clp_ctx);
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
return false;
|
||||
}
|
||||
if (img_res_v.size == 0)
|
||||
{
|
||||
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG_TEE("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
|
||||
// note: assumes only one image was returned by clip_image_preprocess
|
||||
clip_image_f32 * img_res = img_res_v.data;
|
||||
|
||||
img.image_tokens = clip_n_patches(clp_ctx);
|
||||
img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
|
||||
if (!img.image_embedding)
|
||||
{
|
||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
clip_free(clp_ctx);
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
||||
{
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
return false;
|
||||
}
|
||||
|
||||
clip_image_f32_batch_free(img_res_v);
|
||||
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
@@ -1424,6 +1404,46 @@ struct llama_server_context
|
||||
case TASK_TYPE_NEXT_RESPONSE: {
|
||||
// do nothing
|
||||
} break;
|
||||
case TASK_TYPE_SLOTS_DATA: {
|
||||
json slots_data = json::array();
|
||||
int n_idle_slots = 0;
|
||||
int n_processing_slots = 0;
|
||||
|
||||
for (llama_client_slot &slot: slots) {
|
||||
if (slot.available()) {
|
||||
n_idle_slots++;
|
||||
} else {
|
||||
n_processing_slots++;
|
||||
}
|
||||
json slot_data = get_formated_generation(slot);
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["task_id"] = slot.task_id;
|
||||
slot_data["state"] = slot.state;
|
||||
slot_data["prompt"] = slot.prompt;
|
||||
slot_data["next_token"] = {
|
||||
{"has_next_token", slot.has_next_token},
|
||||
{"n_remain", slot.n_remaining},
|
||||
{"num_tokens_predicted", slot.n_decoded},
|
||||
{"stopped_eos", slot.stopped_eos},
|
||||
{"stopped_word", slot.stopped_word},
|
||||
{"stopped_limit", slot.stopped_limit},
|
||||
{"stopping_word", slot.stopping_word},
|
||||
};
|
||||
slots_data.push_back(slot_data);
|
||||
}
|
||||
LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
res.stop = true;
|
||||
res.error = false;
|
||||
res.result_json = {
|
||||
{ "idle", n_idle_slots },
|
||||
{ "processing", n_processing_slots },
|
||||
{ "slots", slots_data }
|
||||
};
|
||||
queue_results.send(res);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1477,14 +1497,15 @@ struct llama_server_context
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
{
|
||||
// Shift context
|
||||
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||
const int n_left = system_tokens.size() + slot.n_past - n_keep;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
|
||||
}
|
||||
@@ -1497,7 +1518,7 @@ struct llama_server_context
|
||||
|
||||
LOG_VERBOSE("context shift", {
|
||||
{ "n_ctx", n_ctx },
|
||||
{ "n_keep", params.n_keep },
|
||||
{ "n_keep", n_keep },
|
||||
{ "n_left", n_left },
|
||||
});
|
||||
}
|
||||
@@ -1937,8 +1958,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf(" --chat-template FORMAT_NAME");
|
||||
printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
|
||||
printf(" --chat-template JINJA_TEMPLATE\n");
|
||||
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
|
||||
printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
@@ -2389,13 +2411,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
if (value != "chatml" && value != "llama2") {
|
||||
fprintf(stderr, "error: chat template can be \"llama2\" or \"chatml\", but got: %s\n", value.c_str());
|
||||
if (!verify_custom_template(argv[i])) {
|
||||
fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
|
||||
fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.chat_template = value;
|
||||
sparams.chat_template = argv[i];
|
||||
}
|
||||
else if (arg == "--override-kv")
|
||||
{
|
||||
@@ -2582,40 +2604,44 @@ int main(int argc, char **argv)
|
||||
res.set_header("Access-Control-Allow-Headers", "*");
|
||||
});
|
||||
|
||||
svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
|
||||
svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
|
||||
server_state current_state = state.load();
|
||||
switch(current_state) {
|
||||
case SERVER_STATE_READY:
|
||||
if (llama.all_slots_are_idle) {
|
||||
res.set_content(R"({"status": "ok"})", "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
} else {
|
||||
int available_slots = 0;
|
||||
int processing_slots = 0;
|
||||
for (llama_client_slot & slot : llama.slots) {
|
||||
if (slot.available()) {
|
||||
available_slots++;
|
||||
} else {
|
||||
processing_slots++;
|
||||
}
|
||||
}
|
||||
if (available_slots > 0) {
|
||||
json health = {
|
||||
{"status", "ok"},
|
||||
{"slots_idle", available_slots},
|
||||
{"slots_processing", processing_slots}};
|
||||
res.set_content(health.dump(), "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
} else {
|
||||
json health = {
|
||||
{"status", "no slot available"},
|
||||
{"slots_idle", available_slots},
|
||||
{"slots_processing", processing_slots}};
|
||||
res.set_content(health.dump(), "application/json");
|
||||
case SERVER_STATE_READY: {
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.target_id = -1;
|
||||
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
llama.queue_tasks.post(task);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task.id);
|
||||
llama.queue_results.remove_waiting_task_id(task.id);
|
||||
|
||||
int n_idle_slots = result.result_json["idle"];
|
||||
int n_processing_slots = result.result_json["processing"];
|
||||
|
||||
json health = {
|
||||
{"status", "ok"},
|
||||
{"slots_idle", n_idle_slots},
|
||||
{"slots_processing", n_processing_slots}};
|
||||
res.status = 200; // HTTP OK
|
||||
if (sparams.slots_endpoint && req.has_param("include_slots")) {
|
||||
health["slots"] = result.result_json["slots"];
|
||||
}
|
||||
|
||||
if (n_idle_slots == 0) {
|
||||
health["status"] = "no slot available";
|
||||
if (req.has_param("fail_on_no_slot")) {
|
||||
res.status = 503; // HTTP Service Unavailable
|
||||
}
|
||||
}
|
||||
res.set_content(health.dump(), "application/json");
|
||||
break;
|
||||
}
|
||||
case SERVER_STATE_LOADING_MODEL:
|
||||
res.set_content(R"({"status": "loading model"})", "application/json");
|
||||
res.status = 503; // HTTP Service Unavailable
|
||||
@@ -2629,26 +2655,20 @@ int main(int argc, char **argv)
|
||||
|
||||
if (sparams.slots_endpoint) {
|
||||
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
|
||||
json slots;
|
||||
for (llama_client_slot & slot : llama.slots) {
|
||||
json slot_data = llama.get_formated_generation(slot);
|
||||
slot_data["id"] = slot.id;
|
||||
slot_data["task_id"] = slot.task_id;
|
||||
slot_data["state"] = slot.state;
|
||||
slot_data["prompt"] = slot.prompt;
|
||||
slot_data["next_token"] = {
|
||||
{"has_next_token", slot.has_next_token},
|
||||
{"n_remain", slot.n_remaining},
|
||||
{"num_tokens_predicted", slot.n_decoded},
|
||||
{"stopped_eos", slot.stopped_eos},
|
||||
{"stopped_word", slot.stopped_word},
|
||||
{"stopped_limit", slot.stopped_limit},
|
||||
{"stopping_word", slot.stopping_word},
|
||||
};
|
||||
// request slots data using task queue
|
||||
task_server task;
|
||||
task.id = llama.queue_tasks.get_new_id();
|
||||
task.type = TASK_TYPE_SLOTS_DATA;
|
||||
task.target_id = -1;
|
||||
|
||||
slots.push_back(slot_data);
|
||||
}
|
||||
res.set_content(slots.dump(), "application/json");
|
||||
llama.queue_results.add_waiting_task_id(task.id);
|
||||
llama.queue_tasks.post(task);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task.id);
|
||||
llama.queue_results.remove_waiting_task_id(task.id);
|
||||
|
||||
res.set_content(result.result_json["slots"].dump(), "application/json");
|
||||
res.status = 200; // HTTP OK
|
||||
});
|
||||
}
|
||||
@@ -2742,6 +2762,11 @@ int main(int argc, char **argv)
|
||||
LOG_INFO("model loaded", {});
|
||||
}
|
||||
|
||||
if (sparams.chat_template.empty()) { // custom chat template is not supplied
|
||||
// check if the template comes with the model is supported by us
|
||||
llama.validate_model_chat_template(sparams);
|
||||
}
|
||||
|
||||
// Middleware for API key validation
|
||||
auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
|
||||
// If API key is not set, skip validation
|
||||
@@ -2913,7 +2938,7 @@ int main(int argc, char **argv)
|
||||
if (!validate_api_key(req, res)) {
|
||||
return;
|
||||
}
|
||||
json data = oaicompat_completion_params_parse(json::parse(req.body), sparams.chat_template);
|
||||
json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
|
||||
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
|
||||
@@ -49,7 +49,8 @@ enum server_state {
|
||||
enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE
|
||||
TASK_TYPE_NEXT_RESPONSE,
|
||||
TASK_TYPE_SLOTS_DATA
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
@@ -167,50 +168,47 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||
: default_value;
|
||||
}
|
||||
|
||||
inline std::string format_llama2(std::vector<json> messages)
|
||||
{
|
||||
std::ostringstream output;
|
||||
bool is_inside_turn = false;
|
||||
|
||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||
if (!is_inside_turn) {
|
||||
output << "[INST] ";
|
||||
}
|
||||
std::string role = json_value(*it, "role", std::string("user"));
|
||||
std::string content = json_value(*it, "content", std::string(""));
|
||||
if (role == "system") {
|
||||
output << "<<SYS>>\n" << content << "\n<<SYS>>\n\n";
|
||||
is_inside_turn = true;
|
||||
} else if (role == "user") {
|
||||
output << content << " [/INST]";
|
||||
is_inside_turn = true;
|
||||
} else {
|
||||
output << " " << content << " </s>";
|
||||
is_inside_turn = false;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_VERBOSE("format_llama2", {{"text", output.str()}});
|
||||
|
||||
return output.str();
|
||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||
inline bool verify_custom_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
std::vector<char> buf(1);
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
|
||||
return res >= 0;
|
||||
}
|
||||
|
||||
inline std::string format_chatml(std::vector<json> messages)
|
||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
|
||||
{
|
||||
std::ostringstream chatml_msgs;
|
||||
size_t alloc_size = 0;
|
||||
// vector holding all allocated string to be passed to llama_chat_apply_template
|
||||
std::vector<std::string> str(messages.size() * 2);
|
||||
std::vector<llama_chat_message> chat(messages.size());
|
||||
|
||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||
chatml_msgs << "<|im_start|>"
|
||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||
<< "<|im_end|>\n";
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
auto &curr_msg = messages[i];
|
||||
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
||||
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
||||
alloc_size += str[i*2 + 1].length();
|
||||
chat[i].role = str[i*2 + 0].c_str();
|
||||
chat[i].content = str[i*2 + 1].c_str();
|
||||
}
|
||||
|
||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size * 2);
|
||||
|
||||
LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||
|
||||
return chatml_msgs.str();
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||
}
|
||||
|
||||
std::string formatted_chat(buf.data(), res);
|
||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
ggml-cuda.cu: 98 lines changed
@@ -528,6 +528,15 @@ typedef struct {
|
||||
} block_iq1_s;
|
||||
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
||||
|
||||
#define QK4_NL 32
|
||||
#define QR4_NL 2
|
||||
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK4_NL/2];
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#define WARP_SIZE 32
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
@@ -1987,6 +1996,26 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
|
||||
}
|
||||
|
||||
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
const uint8_t * q4 = x[ib].qs + 4*il;
|
||||
const float d = (float)x[ib].d;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
@@ -4732,6 +4761,56 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||
#endif
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values,
|
||||
int & val1, int & val2) {
|
||||
|
||||
uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
|
||||
aux32 = q4 & 0x0f0f0f0f;
|
||||
uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
|
||||
uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
|
||||
val1 = v1 | (v2 << 16);
|
||||
aux32 = (q4 >> 4) & 0x0f0f0f0f;
|
||||
v1 = values[q8[0]] | (values[q8[1]] << 8);
|
||||
v2 = values[q8[2]] | (values[q8[3]] << 8);
|
||||
val2 = v1 | (v2 << 16);
|
||||
}
|
||||
#endif
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
|
||||
const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;
|
||||
|
||||
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
||||
|
||||
int v1, v2;
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
||||
const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
|
||||
get_int_from_table_16(aux, values, v1, v2);
|
||||
sumi1 = __dp4a(v1, q8[l+0], sumi1);
|
||||
sumi2 = __dp4a(v2, q8[l+4], sumi2);
|
||||
}
|
||||
|
||||
#else
|
||||
const uint8_t * q4 = bq->qs + 4*iqs;
|
||||
const int8_t * q8 = bq8_1->qs + 4*iqs;
|
||||
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) {
|
||||
sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf];
|
||||
sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4];
|
||||
}
|
||||
#endif
|
||||
const float d = (float)bq->d * __low2float(bq8_1->ds);
|
||||
return d * (sumi1 + sumi2);
|
||||
}
|
||||
|
||||
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
||||
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
|
||||
static __device__ __forceinline__ void mul_mat_q(
|
||||
@@ -6777,6 +6856,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c
|
||||
dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
@@ -6818,6 +6903,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
default:
|
||||
@@ -6855,6 +6942,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return dequantize_row_iq1_s_cuda;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return dequantize_row_iq4_nl_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
default:
|
||||
@@ -8599,6 +8688,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
@@ -8623,6 +8713,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
@@ -8724,6 +8815,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
||||
mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
|
||||
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
@@ -11446,7 +11541,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) {
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
|
||||
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml-metal.m: 35 lines changed
@@ -62,6 +62,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_GROUP_NORM,
|
||||
@@ -85,6 +86,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
//GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
|
||||
@@ -104,6 +106,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
|
||||
@@ -120,6 +123,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
|
||||
@@ -136,6 +140,7 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
|
||||
@@ -448,6 +453,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
|
||||
@@ -471,6 +477,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
|
||||
@@ -490,6 +497,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
@@ -506,6 +514,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
@@ -522,6 +531,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
@@ -1338,6 +1348,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
|
||||
@@ -1478,6 +1489,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
@@ -1525,6 +1542,11 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ4_NL) {
|
||||
const int mem_size = 32*sizeof(float);
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1619,6 +1641,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
}
|
||||
|
||||
@@ -1762,6 +1785,12 @@ static bool ggml_metal_graph_compute(
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
||||
@@ -1825,6 +1854,11 @@ static bool ggml_metal_graph_compute(
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ4_NL) {
|
||||
const int mem_size = 32*sizeof(float);
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
@@ -1867,6 +1901,7 @@ static bool ggml_metal_graph_compute(
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break;
|
||||
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
||||
215
ggml-metal.metal
215
ggml-metal.metal
@@ -2531,6 +2531,12 @@ typedef struct {
|
||||
uint8_t scales[QK_K/16];
|
||||
} block_iq1_s;
|
||||
|
||||
// Non-linear quants
|
||||
#define QK4_NL 32
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK4_NL/2];
|
||||
} block_iq4_nl;
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
@@ -4384,7 +4390,6 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
@@ -4447,6 +4452,103 @@ void kernel_mul_mv_iq1_s_f32_impl(
|
||||
}
|
||||
}
|
||||
|
||||
constexpr constant static float kvalues_iq4nl_f[16] = {
|
||||
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
||||
};
|
||||
|
||||
void kernel_mul_mv_iq4_nl_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup float * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK4_NL;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
const int first_row = (r0 * 2 + sgitg) * 2;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
device const block_iq4_nl * x = (device const block_iq4_nl *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
const int ix = tiisg/2; // 0...15
|
||||
const int it = tiisg%2; // 0 or 1
|
||||
|
||||
shared_values[tiisg] = kvalues_iq4nl_f[tiisg%16];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
float4 yl[4];
|
||||
float sumf[2]={0.f}, all_sum;
|
||||
|
||||
device const float * yb = y + ix * QK4_NL + it * 8;
|
||||
|
||||
uint32_t aux32[2];
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)aux32;
|
||||
|
||||
float4 qf1, qf2;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += 16) {
|
||||
|
||||
device const float4 * y4 = (device const float4 *)yb;
|
||||
yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
|
||||
device const block_iq4_nl & xb = x[row*nb + ib];
|
||||
device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);
|
||||
|
||||
float4 acc1 = {0.f}, acc2 = {0.f};
|
||||
|
||||
aux32[0] = q4[0] | (q4[1] << 16);
|
||||
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
||||
aux32[0] &= 0x0f0f0f0f;
|
||||
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
||||
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
||||
acc1 += yl[0] * qf1;
|
||||
acc2 += yl[1] * qf2;
|
||||
|
||||
aux32[0] = q4[2] | (q4[3] << 16);
|
||||
aux32[1] = (aux32[0] >> 4) & 0x0f0f0f0f;
|
||||
aux32[0] &= 0x0f0f0f0f;
|
||||
qf1 = {shared_values[q8[0]], shared_values[q8[1]], shared_values[q8[2]], shared_values[q8[3]]};
|
||||
qf2 = {shared_values[q8[4]], shared_values[q8[5]], shared_values[q8[6]], shared_values[q8[7]]};
|
||||
acc1 += yl[2] * qf1;
|
||||
acc2 += yl[3] * qf2;
|
||||
|
||||
acc1 += acc2;
|
||||
|
||||
sumf[row] += (float)xb.d * (acc1[0] + acc1[1] + acc1[2] + acc1[3]);
|
||||
|
||||
}
|
||||
|
||||
yb += 16 * QK4_NL;
|
||||
}
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq1_s_f32(
|
||||
device const void * src0,
|
||||
@@ -4475,6 +4577,34 @@ kernel void kernel_mul_mv_iq1_s_f32(
|
||||
kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq4_nl_f32")]]
|
||||
kernel void kernel_mul_mv_iq4_nl_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup float * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
@@ -4838,6 +4968,21 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 &
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
||||
const float d = xb->d;
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows(
|
||||
device const void * src0,
|
||||
@@ -5381,6 +5526,7 @@ template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_r
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
@@ -5421,6 +5567,7 @@ template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_m
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@@ -5473,6 +5620,7 @@ template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
@@ -6503,3 +6651,68 @@ kernel void kernel_mul_mv_id_iq1_s_f32(
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq4_nl_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq4_nl_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup float * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq4_nl_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
234
ggml-quants.c
234
ggml-quants.c
@@ -3754,6 +3754,26 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
|
||||
}
|
||||
}
|
||||
|
||||
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
|
||||
void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
const int nb = k / QK4_NL;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const uint8_t * qs = x[i].qs;
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
for (int j = 0; j < QK4_NL/2; ++j) {
|
||||
y[j+ 0] = d * kvalues_iq4nl[qs[j] & 0xf];
|
||||
y[j+QK4_NL/2] = d * kvalues_iq4nl[qs[j] >> 4];
|
||||
}
|
||||
y += QK4_NL;
|
||||
qs += QK4_NL/2;
|
||||
}
|
||||
}
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
||||
@@ -9148,7 +9168,6 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
assert(n % QK_K == 0);
|
||||
assert(nrc == 1);
|
||||
@@ -9452,7 +9471,100 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
*s = sumf;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
||||
assert(nrc == 1);
|
||||
UNUSED(nrc);
|
||||
UNUSED(bx);
|
||||
UNUSED(by);
|
||||
UNUSED(bs);
|
||||
assert(n % QK4_NL == 0);
|
||||
static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
|
||||
|
||||
const block_iq4_nl * restrict x = vx;
|
||||
const block_q8_0 * restrict y = vy;
|
||||
|
||||
const int nb = n / QK4_NL;
|
||||
|
||||
#if defined __ARM_NEON
|
||||
const int8x16_t values = vld1q_s8(kvalues_iq4nl);
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0f);
|
||||
uint8x16x2_t q4bits;
|
||||
int8x16x4_t q4b;
|
||||
int8x16x4_t q8b;
|
||||
int32x4_t prod_1, prod_2;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int ib = 0; ib < nb; ib += 2) {
|
||||
|
||||
q4bits.val[0] = vld1q_u8(x[ib+0].qs);
|
||||
q4bits.val[1] = vld1q_u8(x[ib+1].qs);
|
||||
q8b.val[0] = vld1q_s8(y[ib+0].qs);
|
||||
q8b.val[1] = vld1q_s8(y[ib+0].qs + 16);
|
||||
q8b.val[2] = vld1q_s8(y[ib+1].qs);
|
||||
q8b.val[3] = vld1q_s8(y[ib+1].qs + 16);
|
||||
|
||||
q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b));
|
||||
q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b));
|
||||
q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
|
||||
prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
|
||||
|
||||
sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
|
||||
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
||||
#elif defined __AVX2__
|
||||
|
||||
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
||||
const __m128i m4b = _mm_set1_epi8(0x0f);
|
||||
const __m256i mone = _mm256_set1_epi16(1);
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
for (int ib = 0; ib < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
|
||||
const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
|
||||
const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
|
||||
const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
|
||||
const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
|
||||
_mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
|
||||
const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
|
||||
const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
|
||||
const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
|
||||
const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
|
||||
accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
|
||||
_mm256_cvtepi32_ps(p_1), accum1);
|
||||
accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
|
||||
_mm256_cvtepi32_ps(p_2), accum2);
|
||||
|
||||
y += 2;
|
||||
x += 2;
|
||||
}
|
||||
|
||||
*s = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
|
||||
#else
|
||||
float sumf = 0;
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
|
||||
int sumi1 = 0, sumi2 = 0;
|
||||
for (int j = 0; j < QK4_NL/2; ++j) {
|
||||
sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
|
||||
sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
|
||||
}
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
}
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
// ================================ IQ2 quantization =============================================
|
||||
@@ -10729,3 +10841,123 @@ size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, in
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq1_s);
|
||||
}
|
||||
|
||||
// ============================ 4-bit non-linear quants
|
||||
|
||||
static inline int best_index_int8(int n, const int8_t * val, float x) {
|
||||
if (x <= val[0]) return 0;
|
||||
if (x >= val[n-1]) return n-1;
|
||||
int ml = 0, mu = n-1;
|
||||
while (mu-ml > 1) {
|
||||
int mav = (ml+mu)/2;
|
||||
if (x < val[mav]) mu = mav; else ml = mav;
|
||||
}
|
||||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
static void quantize_row_iq4_nl_impl(const int block_size, const float * GGML_RESTRICT x,
|
||||
ggml_fp16_t * dh, uint8_t * q4,
|
||||
float * weight, uint8_t * L,
|
||||
const int8_t * values,
|
||||
const float * quant_weights) {
|
||||
|
||||
const int ntry = 7;
|
||||
|
||||
float sigma2 = 0;
|
||||
for (int j = 0; j < QK4_NL; ++j) sigma2 += x[j]*x[j];
|
||||
sigma2 *= 2.f/QK4_NL;
|
||||
|
||||
const int nb = QK4_NL/block_size;
|
||||
|
||||
memset(q4, 0, QK4_NL/2);
|
||||
for (int ib = 0; ib < nb; ++ib) {
|
||||
dh[ib] = GGML_FP32_TO_FP16(0.f);
|
||||
const float * xb = x + ib*block_size;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + ib*block_size;
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
|
||||
} else {
|
||||
for (int j = 0; j < block_size; ++j) weight[j] = xb[j]*xb[j];
|
||||
}
|
||||
float amax = 0, max = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float ax = fabsf(xb[j]);
|
||||
if (ax > amax) {
|
||||
amax = ax; max = xb[j];
|
||||
}
|
||||
}
|
||||
if (!amax) {
|
||||
continue;
|
||||
}
|
||||
float d = -max/values[0];
|
||||
float id = 1/d;
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float al = id*xb[j];
|
||||
int l = best_index_int8(16, values, al);
|
||||
float q = values[l];
|
||||
float w = weight[j];
|
||||
sumqx += w*q*xb[j];
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
float best_id = id;
|
||||
d = sumqx/sumq2;
|
||||
float best = d*sumqx;
|
||||
for (int itry = -ntry; itry <= ntry; ++itry) {
|
||||
id = (itry + values[0])/max;
|
||||
sumqx = sumq2 = 0;
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
float al = id*xb[j];
|
||||
int l = best_index_int8(16, values, al);
|
||||
float q = values[l];
|
||||
float w = weight[j];
|
||||
sumqx += w*q*xb[j];
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
||||
d = sumqx/sumq2; best = d * sumqx;
|
||||
best_id = id;
|
||||
}
|
||||
}
|
||||
dh[ib] = GGML_FP32_TO_FP16(d);
|
||||
for (int j = 0; j < block_size; ++j) {
|
||||
L[ib*block_size + j] = best_index_int8(16, values, best_id*xb[j]);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < QK4_NL/32; ++i) {
|
||||
for (int j = 0; j < 16; ++j) {
|
||||
q4[16*i + j] = L[32*i + j] | (L[32*i + 16 + j] << 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
||||
int nblock = n_per_row/QK4_NL;
|
||||
char * qrow = (char *)dst;
|
||||
uint8_t L[QK4_NL];
|
||||
float weight[32];
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
block_iq4_nl * iq4 = (block_iq4_nl *)qrow;
|
||||
for (int ibl = 0; ibl < nblock; ++ibl) {
|
||||
const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
|
||||
quantize_row_iq4_nl_impl(32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, weight, L, kvalues_iq4nl, qw);
|
||||
}
|
||||
src += n_per_row;
|
||||
qrow += nblock*sizeof(block_iq4_nl);
|
||||
}
|
||||
return nrow * nblock * sizeof(block_iq4_nl);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
block_iq4_nl * restrict y = vy;
|
||||
quantize_row_iq4_nl_reference(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
||||
assert(k % QK4_NL == 0);
|
||||
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
@@ -198,6 +198,14 @@ typedef struct {
|
||||
} block_iq1_s;
|
||||
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
||||
|
||||
// Non-linear quants
|
||||
#define QK4_NL 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d;
|
||||
uint8_t qs[QK4_NL/2];
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -217,6 +225,7 @@ void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGM
|
||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
@@ -232,6 +241,7 @@ void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
@@ -251,6 +261,7 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
@@ -268,6 +279,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
@@ -276,6 +288,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row,
|
||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
@@ -14642,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
|
||||
static ggml_backend_buffer_t
|
||||
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
size_t size) try {
|
||||
int device = (int) (intptr_t) buft->context;
|
||||
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
||||
int device = (int) buft_ctx->device;
|
||||
|
||||
ggml_sycl_set_device(device);
|
||||
int device_index = get_device_index_by_id(device);
|
||||
@@ -14720,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
|
||||
};
|
||||
}
|
||||
ggml_backend_sycl_buffer_type_initialized = true;
|
||||
@@ -14782,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
||||
|
||||
// backend
|
||||
|
||||
struct ggml_backend_context_sycl {
|
||||
int device;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
||||
return GGML_SYCL_NAME;
|
||||
|
||||
@@ -14793,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
delete sycl_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
||||
}
|
||||
@@ -14809,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
@@ -14827,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
@@ -14842,7 +14839,7 @@ catch (sycl::exception const &exc) {
|
||||
}
|
||||
|
||||
static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
|
||||
|
||||
@@ -14878,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
|
||||
}
|
||||
|
||||
static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||
|
||||
@@ -15092,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
||||
ggml_sycl_set_main_device(device);
|
||||
|
||||
ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
|
||||
/* .device = */ device
|
||||
ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
|
||||
/* .device = */ device,
|
||||
/* .name = */ GGML_SYCL_NAME + std::to_string(device),
|
||||
};
|
||||
|
||||
ggml_backend_t sycl_backend = new ggml_backend {
|
||||
|
||||
2
ggml.h
2
ggml.h
@@ -355,6 +355,7 @@ extern "C" {
|
||||
GGML_TYPE_IQ2_XS = 17,
|
||||
GGML_TYPE_IQ3_XXS = 18,
|
||||
GGML_TYPE_IQ1_S = 19,
|
||||
GGML_TYPE_IQ4_NL = 20,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@@ -393,6 +394,7 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
||||
@@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum):
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
MINICPM = auto()
|
||||
GEMMA = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -167,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
MODEL_ARCH.MINICPM: "minicpm",
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -511,6 +513,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.GEMMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
270
llama.cpp
270
llama.cpp
@@ -208,6 +208,7 @@ enum llm_arch {
|
||||
LLM_ARCH_ORION,
|
||||
LLM_ARCH_INTERNLM2,
|
||||
LLM_ARCH_MINICPM,
|
||||
LLM_ARCH_GEMMA,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -234,6 +235,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
{ LLM_ARCH_MINICPM, "minicpm" },
|
||||
{ LLM_ARCH_GEMMA, "gemma" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
@@ -760,6 +762,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_GEMMA,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2527,6 +2545,7 @@ struct llama_model_loader {
|
||||
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
||||
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
||||
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
||||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
@@ -2772,13 +2791,7 @@ struct llama_model_loader {
|
||||
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
|
||||
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
||||
if (!cur) {
|
||||
// some tensors may be allocated in a different context
|
||||
continue;
|
||||
}
|
||||
|
||||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
||||
if (progress_callback) {
|
||||
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
||||
return false;
|
||||
@@ -2877,6 +2890,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
@@ -3241,6 +3255,16 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: model.type = e_model::MODEL_2B; break;
|
||||
case 28: model.type = e_model::MODEL_7B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -3692,7 +3716,7 @@ static bool llm_load_tensors(
|
||||
}
|
||||
|
||||
// create one context per buffer type
|
||||
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
||||
size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
struct ggml_init_params params = {
|
||||
@@ -3830,6 +3854,7 @@ static bool llm_load_tensors(
|
||||
} else {
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4029,6 +4054,8 @@ static bool llm_load_tensors(
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
|
||||
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
}
|
||||
|
||||
@@ -4038,14 +4065,23 @@ static bool llm_load_tensors(
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
|
||||
|
||||
// AWQ ScaleActivation layer
|
||||
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
||||
@@ -4358,6 +4394,40 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
|
||||
const int64_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -6112,7 +6182,7 @@ struct llm_build_context {
|
||||
|
||||
attn_norm = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
NULL,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(attn_norm, "attn_norm", il);
|
||||
|
||||
@@ -6123,6 +6193,11 @@ struct llm_build_context {
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
if (model.layers[il].bqkv){
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(cur, "wqkv_clamped", il);
|
||||
@@ -6139,7 +6214,7 @@ struct llm_build_context {
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
@@ -6152,13 +6227,13 @@ struct llm_build_context {
|
||||
{
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
NULL,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
model.layers[il].ffn_act,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
@@ -6175,7 +6250,7 @@ struct llm_build_context {
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm,
|
||||
NULL,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
@@ -7364,6 +7439,113 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gemma() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
||||
cb(inpL, "inp_scaled", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
||||
cb(Qcur, "Qcur_scaled", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
||||
n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
|
||||
cb(sa_out, "sa_out", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, sa_out, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, sa_out);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
@@ -7472,6 +7654,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_minicpm();
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA:
|
||||
{
|
||||
result = llm.build_gemma();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -10354,6 +10540,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
@@ -10406,6 +10595,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
|
||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
@@ -10422,7 +10614,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
if (arch != LLM_ARCH_FALCON) {
|
||||
if (qs.model.hparams.n_expert == 8) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
@@ -10489,8 +10681,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
||||
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
|
||||
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
||||
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
|
||||
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
|
||||
@@ -10531,7 +10723,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S ; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
@@ -11995,18 +12188,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
data_ctx->write(&kv_used, sizeof(kv_used));

if (kv_buf_size) {
-const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
std::vector<uint8_t> tmp_buf;
for (int il = 0; il < (int) n_layer; ++il) {
-tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+tmp_buf.resize(k_size);
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());

// v is not contiguous, copy row by row
-tmp_buf.resize(elt_size*kv_head);
+size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+tmp_buf.resize(v_row_size);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());
}
}
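The change above replaces elt_size-based byte counts with ggml_row_size(), which matters once the KV cache can hold block-quantized types whose per-element size is fractional. A small illustration, assuming the standard Q8_0 block layout of 34 bytes per 32 elements:

// For plain types, row bytes == n * element size; for block-quantized types
// the row consists of fixed-size blocks, so ggml_row_size() must be used.
size_t f16_row  = ggml_row_size(GGML_TYPE_F16,  4096);   // 4096 * 2        = 8192 bytes
size_t q8_0_row = ggml_row_size(GGML_TYPE_Q8_0, 4096);   // 128 blocks * 34 = 4352 bytes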
@@ -12108,17 +12302,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
if (kv_buf_size) {
GGML_ASSERT(kv_self.total_size() == kv_buf_size);

-const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
for (int il = 0; il < (int) n_layer; ++il) {
-size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
inp += k_size;

// v is not contiguous, copy row by row
-size_t v_row_size = elt_size*kv_head;
+size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
inp += v_row_size;
}
}
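These two functions back the public state (de)serialization API. A hedged round-trip sketch of how callers typically use it, with context setup omitted and ctx assumed to be an initialized llama_context:

// Save the full context state (including the KV cache) and restore it later.
// The restoring context must use the same model and n_ctx.
std::vector<uint8_t> state(llama_get_state_size(ctx));
llama_copy_state_data(ctx, state.data());
// ... later, or in another process after reloading the model ...
llama_set_state_data(ctx, state.data());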
@@ -12580,6 +12773,15 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
|
||||
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
||||
for (auto message : chat) {
|
||||
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
||||
ss << bos << message->role << "\n" << message->content << "</s>\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<s>assistant\n";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
@@ -12602,7 +12804,7 @@ LLAMA_API int32_t llama_chat_apply_template(
// load template from model
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
std::string template_key = "tokenizer.chat_template";
-int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
+int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
if (res < 0) {
// worst case: there is no information about template, we will use chatml by default
curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal

llama.h
@@ -101,6 +101,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors

LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@@ -707,7 +708,7 @@ extern "C" {

/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-/// NOTE: This function only support some known jinja templates. It is not a jinja parser.
+/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
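A minimal usage sketch of this API, including the grow-and-retry pattern implied by the return value; it assumes a loaded llama_model * model, and the message contents are placeholders:

// Render a chat with the model's built-in template (tmpl == nullptr).
std::vector<llama_chat_message> msgs = {
    { "system", "You are a helpful assistant" },
    { "user",   "Hello" },
};
std::vector<char> buf(1024);
int32_t n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                      /*add_ass=*/true, buf.data(), buf.size());
if (n > (int32_t) buf.size()) { // output did not fit: grow the buffer and retry
    buf.resize(n);
    n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
                                  /*add_ass=*/true, buf.data(), buf.size());
}
std::string prompt(buf.data(), n);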
@@ -1 +1 @@
-818eeb8a3be99125746a90ec63af8f51516a2ec6
+30805514e1bf389a59d30a54a0525cbdc30d5bd1

@@ -1918,6 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
GGML_TYPE_Q6_K,
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
+GGML_TYPE_IQ4_NL,
};

// unary ops

@@ -27,12 +27,20 @@ int main(void) {
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
// bofenghuang/vigogne-2-70b-chat
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+// mlabonne/AlphaMonarch-7B
+"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
};
-std::vector<std::string> expected_substr = {
-"<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
-"[/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
-"</s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
-"[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+std::vector<std::string> expected_output = {
+// teknium/OpenHermes-2.5-Mistral-7B
+"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
+// mistralai/Mistral-7B-Instruct-v0.2
+"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+// TheBloke/FusionNet_34Bx2_MoE-AWQ
+"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
+// bofenghuang/vigogne-2-70b-chat
+"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+// mlabonne/AlphaMonarch-7B
+"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
};
std::vector<char> formatted_chat(1024);
int32_t res;
@@ -43,7 +51,7 @@ int main(void) {

for (size_t i = 0; i < templates.size(); i++) {
std::string custom_template = templates[i];
-std::string substr = expected_substr[i];
+std::string expected = expected_output[i];
formatted_chat.resize(1024);
res = llama_chat_apply_template(
nullptr,
@@ -57,8 +65,7 @@ int main(void) {
formatted_chat.resize(res);
std::string output(formatted_chat.data(), formatted_chat.size());
std::cout << output << "\n-------------------------\n";
-// expect the "formatted_chat" to contain pre-defined strings
-assert(output.find(substr) != std::string::npos);
+assert(output == expected);
}
return 0;
}