Compare commits


32 Commits
b3925 ... b3957

Author SHA1 Message Date
leo-pony
6b8447352d [CANN] Adapt to dynamically loadable backends mechanism (#9970)
* [CANN] Adapt to dynamically loadable backends mechanism

* Fix bug: inference results are garbled when running LM models of the Q4_0 class in debug mode

* Address the review comments on this pull request
2024-10-22 16:16:01 +08:00
Daniel Bevenius
674804a996 arg : fix typo in embeddings argument help [no ci] (#9994)
This commit fixes two typos in the help text for the `--embd-normalize`
and `--embd-separator` arguments. It also updates common.h, which contains
the same typo in two comments.
2024-10-22 10:40:02 +03:00
Georgi Gerganov
e94a138d64 llama.vim : fix info text display [no ci] (#9787) 2024-10-22 00:37:55 +03:00
Georgi Gerganov
e01c67affe llama.vim : move info to the right of screen [no ci] (#9787)
'eol' messes up the rendering with nvim v0.10.2 for some reason
2024-10-21 22:53:18 +03:00
Asghar Ghorbani
994cfb1acb readme : update UI list (#9972)
add PocketPal AI app
2024-10-21 21:20:59 +03:00
Daniel Bevenius
94008cc760 arg : fix attention non-causal arg value hint (#9985)
This commit updates the argument value hint for the `--attention`
argument to `non-causal`.

The motivation for this change is that the only values for this argument
are `causal` and `non-causal`.
2024-10-21 21:12:52 +03:00
Georgi Gerganov
dbd5f2f573 llama.vim : plugin for Neovim (#9787)
2024-10-21 20:25:02 +03:00
Georgi Gerganov
f594bc80ba ggml : add asserts for type conversion in fattn kernels (#9971)
ggml-ci
2024-10-21 16:20:46 +03:00
Radoslav Gerganov
d5ebd79c76 rpc : pack only RPC structs (#9959) 2024-10-21 13:35:40 +03:00
Georgi Gerganov
55e47786e3 llama : default sampling changes + greedy update (#9897)
* llama : deprecate softmax sampler + fix dist sampler

ggml-ci

* tests : replace macros with functions

ggml-ci

* sampling : change temperature sampler logic

For t <= 0.0f, keep the max logit intact and set the rest to -inf

* cont : no need for special "greedy" logic

top-k == 1 is the same

* tests : init prob correctly

* llama : handle temp <= 0.0 in the temp_ext sampler too

ggml-ci

* cont : avoid extra loop in temperature sampler for sub-zero temp

ggml-ci
2024-10-21 09:46:40 +03:00
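A minimal sketch of the temperature behavior described in the commit above (the struct and function names are illustrative, not the actual llama.cpp internals):

```cpp
// Sketch of the described behavior: for temp <= 0 keep the max logit and set
// the rest to -inf, so a following dist or top-k sampler behaves like greedy
// (top-k == 1 picks the same token).
#include <cmath>
#include <cstddef>
#include <vector>

struct token_logit { int id; float logit; };

static void apply_temperature(std::vector<token_logit> & cur, float temp) {
    if (temp <= 0.0f) {
        // find the token with the highest logit, push the rest to -inf
        size_t best = 0;
        for (size_t i = 1; i < cur.size(); ++i) {
            if (cur[i].logit > cur[best].logit) {
                best = i;
            }
        }
        for (size_t i = 0; i < cur.size(); ++i) {
            if (i != best) {
                cur[i].logit = -INFINITY;
            }
        }
        return;
    }
    // standard temperature scaling for temp > 0
    for (auto & t : cur) {
        t.logit /= temp;
    }
}
```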
Georgi Gerganov
bc21975084 speculative : fix handling of some input params (#9963)
* speculative : fix batch sizes at initialization

ggml-ci

* speculative : handle params.n_predict == -1

* speculative : limit batch size to llama_n_batch
2024-10-21 09:37:12 +03:00
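A hedged sketch of the two parameter fixes above (variable and field names are assumptions, not the example's exact code):

```cpp
#include <algorithm>
#include <climits>

// treat n_predict == -1 as "no limit", and clamp the draft batch size to the
// batch size the context was configured with (llama_n_batch)
const int n_predict   = params.n_predict < 0 ? INT_MAX : params.n_predict;
const int n_batch_max = (int) llama_n_batch(ctx);
const int n_draft     = std::min(params.n_draft, n_batch_max);
```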
Neo Zhang Jianyu
1db8c84fc6 fix mul_mat_vec_q and *_vec_q error (#9939)
Co-authored-by: arthw <14088817+arthw@users.noreply.github.com>
2024-10-21 14:26:09 +08:00
Loïc Carrère
45f097645e readme : update bindings list (#9951)
Update the binding list by adding LM-Kit.NET (C# & VB.NET)
2024-10-20 19:25:41 +03:00
icppWorld
7cab2083c7 readme : update infra list (#9942)
llama_cpp_canister allows you to run llama.cpp as a Smart Contract on the Internet Computer. The smart contract runs as WebAssembly in a so-called 'canister'.
2024-10-20 19:01:34 +03:00
Xuan Son Nguyen
cda0e4b648 llama : remove all_pos_0, all_pos_1, all_seq_id from llama_batch (#9745)
* refactor llama_batch_get_one

* adapt all examples

* fix simple.cpp

* fix llama_bench

* fix

* fix context shifting

* free batch before return

* use common_batch_add, reuse llama_batch in loop

* null terminated seq_id list

* fix save-load-state example

* fix perplexity

* correct token pos in llama_batch_allocr
2024-10-18 23:18:01 +02:00
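A hedged fragment showing the calling conventions after this change, based on the call-site diffs further below (the surrounding `ctx`, `prompt` and `n_batch` are assumed to exist):

```cpp
// llama_batch_get_one() now takes only the token pointer and count; positions
// and sequence ids are no longer passed there. Callers that need explicit
// positions build a batch with common_batch_add() and reuse it in a loop, as
// the imatrix diff below shows.
std::vector<llama_token> tokens = common_tokenize(ctx, prompt, /*add_special=*/true);

// simple case: implicit positions
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));

// explicit positions: build and reuse a llama_batch
llama_batch batch = llama_batch_init(n_batch, 0, 1);
common_batch_clear(batch);
for (int i = 0; i < (int) tokens.size(); i++) {
    common_batch_add(batch, tokens[i], /*pos=*/i, { 0 }, /*logits=*/true);
}
if (llama_decode(ctx, batch)) {
    // handle the error before releasing the batch
}
llama_batch_free(batch);
```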
Radoslav Gerganov
afd9909a64 rpc : backend refactoring (#9912)
* rpc : refactor backend

Use structs for RPC request/response messages

* rpc : refactor server
2024-10-18 14:33:58 +03:00
Ouadie EL FAROUKI
87421a23e8 [SYCL] Add SYCL Backend registry, device and Event Interfaces (#9705)
* implemented missing SYCL event APIs

* sycl : Added device and backend reg interfaces

* Restructured ggml-sycl.cpp
2024-10-18 06:46:16 +01:00
Ma Mingfei
60ce97c9d8 add amx kernel for gemm (#8998)
add intel amx isa detection

add vnni kernel for gemv cases

add vnni and amx kernel support for block_q8_0

code cleanup

fix packing B issue

enable openmp

fine tune amx kernel

switch to aten parallel pattern

add error message for nested parallelism

code cleanup

add f16 support in ggml-amx

add amx kernels for QK_K quant formats: Q4_K, Q5_K, Q6_K and IQ4_XS

update CMakeList

update README

fix some compilation warning

fix compiler warning when amx is not enabled

minor change

ggml-ci

move ggml_amx_init from ggml.c to ggml-amx/mmq.cpp

ggml-ci

update CMakeLists with -mamx-tile, -mamx-int8 and -mamx-bf16

ggml-ci

add amx as a ggml-backend

update header file, the old path for immintrin.h has changed to ggml-cpu-impl.h

minor change

update CMakeLists.txt

minor change

apply weight prepacking in set_tensor method in ggml-backend

fix compile error

ggml-ci

minor change

ggml-ci

update CMakeLists.txt

ggml-ci

add march dependency

minor change

ggml-ci

change ggml_backend_buffer_is_host to return false for amx backend

ggml-ci

fix supports_op

use device reg for AMX backend

ggml-ci

minor change

ggml-ci

minor change

fix rebase

set .buffer_from_host_ptr to be false for AMX backend
2024-10-18 13:34:36 +08:00
Georgi Gerganov
8901755ba3 server : add n_indent parameter for line indentation requirement (#9929)
ggml-ci
2024-10-18 07:32:19 +03:00
Daniel Bevenius
6f55bccbb8 llama : rename batch_all to batch (#8881)
This commit addresses the TODO in the code to rename the `batch_all`
parameter to `batch` in `llama_decode_internal`.
2024-10-18 01:41:51 +02:00
Georgi Gerganov
17bb928080 readme : remove --memory-f32 references (#9925) 2024-10-17 23:43:05 +03:00
Georgi Gerganov
9f45fc1e99 llama : change warning to debug log 2024-10-17 23:27:42 +03:00
Georgi Gerganov
99bd4ac28c llama : infill sampling handle very long tokens (#9924)
* llama : infill sampling handle very long tokens

ggml-ci

* cont : better indices

ggml-ci
2024-10-17 22:32:47 +03:00
Tim Wang
3752217ed5 readme : update bindings list (#9918)
Co-authored-by: Tim Wang <tim.wang@ing.com>
2024-10-17 09:57:14 +03:00
Diego Devesa
f010b77a37 vulkan : add backend registry / device interfaces (#9721)
* vulkan : add backend registry / device interfaces

* llama : print devices used on model load
2024-10-17 02:46:58 +02:00
Gilad S.
2194200278 fix: allocating CPU buffer with size 0 (#9917) 2024-10-17 01:34:22 +02:00
Gilad S.
73afe681aa fix: use vm_allocate to allocate CPU backend buffer on macOS (#9875)
* fix: use `vm_allocate` to allocate CPU backend buffer on macOS

* fix: switch to `posix_memalign` to keep existing `free()` usages work

* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS

* style: formatting

* fix: move const outside of `#ifndef`

* style: formatting

* fix: unused var

* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`

* fix: unused var

* fix: page align to `GGUF_DEFAULT_ALIGNMENT`

* fix: page align to `TENSOR_ALIGNMENT`

* fix: convert `TENSOR_ALIGNMENT` to a macro

* fix: increase page size to `32` on iOS

* fix: iOS page size

* fix: `hbw_posix_memalign` alignment
2024-10-17 00:36:51 +02:00
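The allocation change can be sketched roughly as follows. This is a standalone illustration based on the commit bullets (macOS path via vm_allocate, other platforms via posix_memalign so plain free() keeps working); the function names are not the actual ggml helpers:

```cpp
#include <cstddef>
#include <cstdlib>

#ifdef __APPLE__
#include <mach/mach.h>
#include <mach/vm_statistics.h>
#endif

// allocate a page-aligned buffer; on macOS use vm_allocate, elsewhere fall
// back to posix_memalign with the requested alignment
static void * aligned_alloc_sketch(size_t size, size_t alignment) {
#ifdef __APPLE__
    vm_address_t addr = 0;
    if (vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE) == KERN_SUCCESS) {
        return (void *) addr; // vm_allocate returns page-aligned memory
    }
    return nullptr;
#else
    void * ptr = nullptr;
    if (posix_memalign(&ptr, alignment, size) != 0) {
        return nullptr;
    }
    return ptr;
#endif
}

// the matching free must mirror the allocation path
static void aligned_free_sketch(void * ptr, size_t size) {
#ifdef __APPLE__
    vm_deallocate(mach_task_self(), (vm_address_t) ptr, size);
#else
    (void) size;
    free(ptr);
#endif
}
```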
Daniel Bevenius
9e04102448 llama : suppress conversion from 'size_t' to 'int' (#9046)
* llama : suppress conversion from 'size_t' to 'int'

This commit updates llm_tokenizer_spm.tokenize to suppress/remove the
following warnings that are generated on Windows when using MSVC:

```console
src\llama-vocab.cpp(211,1): warning C4267: 'argument':
    conversion from 'size_t' to 'int', possible loss of data
src\llama-vocab.cpp(517,1): warning C4267: 'argument':
    conversion from 'size_t' to 'int', possible loss of data
```

This is done by adding a cast for the size_t returned from
symbols.size(). I believe this is safe as it seems unlikely that
symbols, which stores an entry for each UTF8 character, would become
larger than INT_MAX.

The motivation for this change is to reduce the number of warnings that
are currently generated when building on Windows.

* squash! llama : suppress conversion from 'size_t' to 'int'

Move cast into for loop.
2024-10-16 20:34:28 +03:00
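A minimal illustration of the described cast (the function and container here are hypothetical; the actual change is in llm_tokenizer_spm.tokenize):

```cpp
#include <vector>

// Casting the size_t loop bound to int in the condition silences MSVC C4267;
// this is safe as long as the element count stays far below INT_MAX.
static int count_items(const std::vector<int> & symbols) {
    int n = 0;
    for (int i = 0; i < (int) symbols.size(); ++i) {
        ++n; // placeholder for work on symbols[i]
    }
    return n;
}
```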
Daniel Bevenius
dbf18e4de9 llava : fix typo in error message [no ci] (#9884) 2024-10-16 20:24:05 +03:00
Joe Eli McIlvain
66c2c93082 grammar : fix JSON Schema for string regex with top-level alt. (#9903)
Prior to this commit, using a JSON Schema containing a string
with `pattern` regular expression that uses top-level alternation
(e.g. `"pattern": "^A|B|C|D$"`) would result in invalid JSON
output from the constrained sampling grammar, because it
ended up creating a grammar rule like this for the string:

```
thing ::= "\"" "A" | "B" | "C" | "D" "\"" space
```

Note that this rule will only match a starting quote for the "A" case,
and will only match an ending quote for the "D" case,
so this rule will always produce invalid JSON when used for sampling
(that is, the JSON will always be lacking the starting quote,
the ending quote, or both).

This was fixed in a simple way by adding parentheses to the
generated rule (for all string pattern rules, to keep it simple),
such that the new generated rule looks like this (correct):

```
thing ::= "\"" ("A" | "B" | "C" | "D") "\"" space
```
2024-10-16 19:03:24 +03:00
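A small standalone sketch mirroring the one-line converter changes shown in the diffs below (the helper name is hypothetical):

```cpp
#include <iostream>
#include <string>

// Wrapping the pattern body in parentheses makes a top-level alternation bind
// inside the surrounding quote literals of the generated GBNF string rule.
static std::string string_pattern_rule(const std::string & body) {
    // before the fix the body was concatenated without the parentheses
    return "\"\\\"\" (" + body + ") \"\\\"\" space";
}

int main() {
    // prints: "\"" ("A" | "B" | "C" | "D") "\"" space
    std::cout << string_pattern_rule("\"A\" | \"B\" | \"C\" | \"D\"") << "\n";
    return 0;
}
```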
Molly Sophia
10433e8b45 llama : add tensor name for "result_norm" (#9907)
Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
2024-10-16 13:10:21 +03:00
Alexey Parfenov
1f66b699c4 server : fix the disappearance of the end of the text (#9867)
* server: fix the disappearance of the end of the text when streaming with stop strings

* simplify "send text" checks
2024-10-16 11:35:53 +03:00
59 changed files with 6873 additions and 2338 deletions

View File

@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()
if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()
if (NOT DEFINED GGML_CUDA_GRAPHS)
set(GGML_CUDA_GRAPHS_DEFAULT ON)
endif()

View File

@@ -93,11 +93,6 @@ GGML_METAL := 1
DEPRECATE_WARNING := 1
endif
ifdef LLAMA_OPENMP
GGML_OPENMP := 1
DEPRECATE_WARNING := 1
endif
ifdef LLAMA_RPC
GGML_RPC := 1
DEPRECATE_WARNING := 1
@@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o
endif
ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
endif
ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML += ggml/src/ggml-rpc.o
@@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
ifndef GGML_NO_AMX
ggml/src/ggml-amx.o: \
ggml/src/ggml-amx.cpp \
ggml/include/ggml-amx.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ggml/src/ggml-amx/mmq.o: \
ggml/src/ggml-amx/mmq.cpp \
ggml/src/ggml-amx/mmq.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifdef GGML_RPC
ggml/src/ggml-rpc.o: \
ggml/src/ggml-rpc.cpp \
@@ -1238,6 +1251,7 @@ clean:
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -vrf ggml/src/ggml-amx/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

View File

@@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- AVX, AVX2, AVX512 and AMX support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
- Vulkan and SYCL backend support
@@ -122,6 +122,7 @@ Typically finetunes of the base models below are supported as well.
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- C#/VB.NET (more features - community license): [LM-Kit.NET](https://docs.lm-kit.com/lm-kit-net/index.html)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
@@ -131,6 +132,7 @@ Typically finetunes of the base models below are supported as well.
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
**UI:**
@@ -171,6 +173,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
@@ -186,6 +189,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

View File

@@ -1097,7 +1097,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
add_opt(common_arg(
{"--attention"}, "{causal,non,causal}",
{"--attention"}, "{causal,non-causal}",
"attention type for embeddings, use model default if unspecified",
[](common_params & params, const std::string & value) {
/**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
@@ -1695,7 +1695,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_BENCH}));
add_opt(common_arg(
{"--embd-normalize"}, "N",
string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
[](common_params & params, int value) {
params.embd_normalize = value;
}
@@ -1709,7 +1709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
add_opt(common_arg(
{"--embd-separator"}, "STRING",
"separator of embendings (default \\n) for example \"<#sep#>\"",
"separator of embeddings (default \\n) for example \"<#sep#>\"",
[](common_params & params, const std::string & value) {
params.embd_sep = value;
}

View File

@@ -955,7 +955,7 @@ struct common_init_result common_init_from_params(common_params & params) {
}
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
@@ -964,7 +964,7 @@ struct common_init_result common_init_from_params(common_params & params) {
tmp.push_back(decoder_start_token_id);
}
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
}
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
@@ -1035,7 +1035,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Invalid cache type: " + s);
throw std::runtime_error("Unsupported cache type: " + s);
}
struct llama_context_params common_context_params_to_llama(const common_params & params) {
@@ -1047,7 +1047,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;

View File

@@ -274,9 +274,9 @@ struct common_params {
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embendings
std::string embd_sep = "\n"; // separator of embeddings
bool reranking = false; // enable reranking support on server
// server params

View File

@@ -611,7 +611,7 @@ private:
}
return join_seq();
};
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
}
/*

View File

@@ -171,60 +171,46 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
params.penalize_nl,
params.ignore_eos));
if (params.temp > 0.0f) {
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_TOP_K:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
break;
case COMMON_SAMPLER_TYPE_TOP_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_MIN_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_TFS_Z:
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
case COMMON_SAMPLER_TYPE_TEMPERATURE:
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break;
case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else {
GGML_ASSERT(false && "unknown mirostat version");
}
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
} else {
if (params.n_probs > 0) {
// some use cases require to sample greedily, but still obtain the probabilities of the top tokens
// ref: https://github.com/ggerganov/llama.cpp/pull/9605
//
// the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
// it is much faster, since we avoid sorting all tokens and should give a good approximation
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
}
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
GGML_ASSERT(false && "unknown mirostat version");
}
return result;

View File

@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);

View File

@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
llama_kv_cache_clear(ctx);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}

View File

@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}

View File

@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
common_batch_clear(batch);
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
}
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
llama_batch_free(batch);
return false;
}
@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {

View File

@@ -396,7 +396,7 @@ int main(int argc, char ** argv) {
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}

View File

@@ -540,7 +540,7 @@ class SchemaConverter:
return self._add_rule(
name,
to_rule(transform()) if self._raw_pattern \
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
def _resolve_ref(self, ref):

View File

@@ -151,7 +151,7 @@ static std::string get_gpu_info() {
int count = ggml_backend_sycl_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_sycl_get_device_description(i, buf, sizeof(buf));
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
@@ -1428,7 +1428,7 @@ struct sql_printer : public printer {
}
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
for (int i = 1; i < n_tokens; i++) {
tokens[i] = std::rand() % n_vocab;
}
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
n_processed += n_tokens;
}
llama_synchronize(ctx);
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx);
@@ -1460,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
for (int i = 0; i < n_gen; i++) {
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
llama_decode(ctx, llama_batch_get_one(&token, 1));
llama_synchronize(ctx);
token = std::rand() % n_vocab;
}
@@ -1596,13 +1596,13 @@ int main(int argc, char ** argv) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
}
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
}
test_gen(ctx, 1, 0, t.n_threads);
test_gen(ctx, 1, t.n_threads);
}
for (int i = 0; i < params.reps; i++) {
@@ -1614,13 +1614,13 @@ int main(int argc, char ** argv) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
}
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
}
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
test_gen(ctx, t.n_gen, t.n_threads);
}
uint64_t t_ns = get_time_ns() - t_start;

View File

@@ -283,9 +283,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
nullptr,
nullptr,
nullptr,
0,
0,
0,
};
if (embd) {

View File

@@ -46,7 +46,6 @@ actor LlamaContext {
let sparams = llama_sampler_chain_default_params()
self.sampling = llama_sampler_chain_init(sparams)
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
}

examples/llama.vim (new file, 697 lines)
View File

@@ -0,0 +1,697 @@
" LLM-based text completion using llama.cpp
"
" requires:
"
" - neovim
" - curl
" - llama.cpp server instance
" - FIM-compatible model
"
" sample config:
"
" - Tab - accept the current suggestion
" - Shift+Tab - accept just the first line of the segguestion
" - Ctrl+F - toggle FIM completion manually
"
" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
"
" start the llama.cpp server with a FIM-compatible model. for example:
"
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
"
" --batch-size [512, model max context]
"
" adjust the batch size to control how much of the provided local context will be used during the inference
" lower values will use smaller part of the context around the cursor, which will result in faster processing
"
" --ubatch-size [64, 2048]
"
" chunks the batch into smaller chunks for faster processing
" depends on the specific hardware. use llama-bench to profile and determine the best size
"
" --cache-reuse (ge:llama_config.n_predict, 1024]
"
" this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
" using non-zero value enables context reuse on the server side which dramatically improves the performance at
" large contexts. a value of 256 should be good for all cases
"
" run this once to initialise llama.vim:
"
" :call llama#init()
"
" more info: https://github.com/ggerganov/llama.cpp/pull/9787
"
" colors (adjust to your liking)
highlight llama_hl_hint guifg=#ff772f
highlight llama_hl_info guifg=#77ff2f
" general parameters:
"
" endpoint: llama.cpp server endpoint
" n_prefix: number of lines before the cursor location to include in the local prefix
" n_suffix: number of lines after the cursor location to include in the local suffix
" n_predict: max number of tokens to predict
" t_max_prompt_ms: max alloted time for the prompt processing (TODO: not yet supported)
" t_max_predict_ms: max alloted time for the prediction
" show_info: show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
" auto_fim: trigger FIM completion automatically on cursor movement
" max_line_suffix: do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
"
" ring buffer of chunks, accumulated with time upon:
"
" - completion request
" - yank
" - entering a buffer
" - leaving a buffer
" - writing a file
"
" parameters for the ring-buffer with extra context:
"
" ring_n_chunks: max number of chunks to pass as extra context to the server (0 to disable)
" ring_chunk_size: max size of the chunks (in number of lines)
" note: adjust these numbers so that you don't overrun your context
" at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
" ring_scope: the range around the cursor position (in number of lines) for gathering chunks after FIM
" ring_update_ms: how often to process queued chunks in normal mode
"
let s:default_config = {
\ 'endpoint': 'http://127.0.0.1:8012/infill',
\ 'n_prefix': 256,
\ 'n_suffix': 64,
\ 'n_predict': 128,
\ 't_max_prompt_ms': 500,
\ 't_max_predict_ms': 1000,
\ 'show_info': 2,
\ 'auto_fim': v:true,
\ 'max_line_suffix': 8,
\ 'ring_n_chunks': 64,
\ 'ring_chunk_size': 64,
\ 'ring_scope': 1024,
\ 'ring_update_ms': 1000,
\ }
let g:llama_config = get(g:, 'llama_config', s:default_config)
function! s:rand(i0, i1) abort
return a:i0 + rand() % (a:i1 - a:i0 + 1)
endfunction
function! llama#init()
if !executable('curl')
echohl WarningMsg
echo 'llama.vim requires the "curl" command to be available'
echohl None
return
endif
let s:pos_x = 0 " cursor position upon start of completion
let s:pos_y = 0
let s:line_cur = ''
let s:line_cur_prefix = ''
let s:line_cur_suffix = ''
let s:ring_chunks = [] " current set of chunks used as extra context
let s:ring_queued = [] " chunks that are queued to be sent for processing
let s:ring_n_evict = 0
let s:hint_shown = v:false
let s:pos_y_pick = -9999 " last y where we picked a chunk
let s:pos_dx = 0
let s:content = []
let s:can_accept = v:false
let s:timer_fim = -1
let s:t_fim_start = reltime() " used to measure total FIM time
let s:t_last_move = reltime() " last time the cursor moved
let s:current_job = v:null
augroup llama
autocmd!
autocmd InsertEnter * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
autocmd InsertLeavePre * call llama#fim_cancel()
autocmd CursorMoved * call s:on_move()
autocmd CursorMovedI * call s:on_move()
autocmd CompleteChanged * call llama#fim_cancel()
if g:llama_config.auto_fim
autocmd CursorMovedI * call llama#fim(v:true)
endif
" gather chunks upon yanking
autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif
" gather chunks upon entering/leaving a buffer
autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
" gather chunk upon saving the file
autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
augroup END
silent! call llama#fim_cancel()
" init background update of the ring buffer
if g:llama_config.ring_n_chunks > 0
call s:ring_update()
endif
endfunction
" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
function! s:chunk_sim(c0, c1)
let l:lines0 = len(a:c0)
let l:lines1 = len(a:c1)
let l:common = 0
for l:line0 in a:c0
for l:line1 in a:c1
if l:line0 == l:line1
let l:common += 1
break
endif
endfor
endfor
return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction
" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
"
" no_mod - do not pick chunks from buffers with pending changes
" do_evict - evict chunks that are very similar to the new one
"
function! s:pick_chunk(text, no_mod, do_evict)
" do not pick chunks from buffers with pending changes or buffers that are not files
if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
return
endif
" if the extra context option is disabled - do nothing
if g:llama_config.ring_n_chunks <= 0
return
endif
" don't pick very small chunks
if len(a:text) < 3
return
endif
if len(a:text) + 1 < g:llama_config.ring_chunk_size
let l:chunk = a:text
else
let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])
let l:chunk = a:text[l:l0:l:l1]
endif
let l:chunk_str = join(l:chunk, "\n") . "\n"
" check if this chunk is already added
let l:exist = v:false
for i in range(len(s:ring_chunks))
if s:ring_chunks[i].data == l:chunk
let l:exist = v:true
break
endif
endfor
for i in range(len(s:ring_queued))
if s:ring_queued[i].data == l:chunk
let l:exist = v:true
break
endif
endfor
if l:exist
return
endif
" evict queued chunks that are very similar to the new one
for i in range(len(s:ring_queued) - 1, 0, -1)
if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
if a:do_evict
call remove(s:ring_queued, i)
let s:ring_n_evict += 1
else
return
endif
endif
endfor
" also from s:ring_chunks
for i in range(len(s:ring_chunks) - 1, 0, -1)
if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
if a:do_evict
call remove(s:ring_chunks, i)
let s:ring_n_evict += 1
else
return
endif
endif
endfor
" TODO: become parameter ?
if len(s:ring_queued) == 16
call remove(s:ring_queued, 0)
endif
call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})
"let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
endfunction
" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
" called every g:llama_config.ring_update_ms
function! s:ring_update()
call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})
" update only if in normal mode or if the cursor hasn't moved for a while
if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
return
endif
if len(s:ring_queued) == 0
return
endif
" move the first queued chunk to the ring buffer
if len(s:ring_chunks) == g:llama_config.ring_n_chunks
call remove(s:ring_chunks, 0)
endif
call add(s:ring_chunks, remove(s:ring_queued, 0))
"let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
" send asynchronous job with the new extra context so that it is ready for the next FIM
let l:extra_context = []
for l:chunk in s:ring_chunks
call add(l:extra_context, {
\ 'text': l:chunk.str,
\ 'time': l:chunk.time,
\ 'filename': l:chunk.filename
\ })
endfor
" no samplers needed here
let l:request = json_encode({
\ 'input_prefix': "",
\ 'input_suffix': "",
\ 'input_extra': l:extra_context,
\ 'prompt': "",
\ 'n_predict': 1,
\ 'temperature': 0.0,
\ 'stream': v:false,
\ 'samplers': ["temperature"],
\ 'cache_prompt': v:true,
\ 't_max_prompt_ms': 1,
\ 't_max_predict_ms': 1
\ })
let l:curl_command = printf(
\ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
\ g:llama_config.endpoint, shellescape(l:request)
\ )
" no callbacks because we don't need to process the response
call jobstart(l:curl_command, {})
endfunction
" necessary for 'inoremap <expr>'
function! llama#fim_inline(is_auto) abort
call llama#fim(a:is_auto)
return ''
endfunction
" the main FIM call
" takes local context around the cursor and sends it together with the extra context to the server for completion
function! llama#fim(is_auto) abort
" we already have a suggestion for the current cursor position
if s:hint_shown && !a:is_auto
call llama#fim_cancel()
return
endif
call llama#fim_cancel()
" avoid sending repeated requests too fast
if reltimefloat(reltime(s:t_fim_start)) < 0.6
if s:timer_fim != -1
call timer_stop(s:timer_fim)
let s:timer_fim = -1
endif
let s:t_fim_start = reltime()
let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
return
endif
let s:t_fim_start = reltime()
let s:content = []
let s:can_accept = v:false
let s:pos_x = col('.') - 1
let s:pos_y = line('.')
let l:max_y = line('$')
let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))
let s:line_cur = getline('.')
let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)
if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
return
endif
let l:prefix = ""
\ . join(l:lines_prefix, "\n")
\ . "\n"
let l:prompt = ""
\ . s:line_cur_prefix
let l:suffix = ""
\ . s:line_cur_suffix
\ . "\n"
\ . join(l:lines_suffix, "\n")
\ . "\n"
" prepare the extra context data
let l:extra_context = []
for l:chunk in s:ring_chunks
call add(l:extra_context, {
\ 'text': l:chunk.str,
\ 'time': l:chunk.time,
\ 'filename': l:chunk.filename
\ })
endfor
" the indentation of the current line
let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
let l:request = json_encode({
\ 'input_prefix': l:prefix,
\ 'input_suffix': l:suffix,
\ 'input_extra': l:extra_context,
\ 'prompt': l:prompt,
\ 'n_predict': g:llama_config.n_predict,
\ 'n_indent': l:indent,
\ 'top_k': 40,
\ 'top_p': 0.99,
\ 'stream': v:false,
\ 'samplers': ["top_k", "top_p", "infill"],
\ 'cache_prompt': v:true,
\ 't_max_prompt_ms': g:llama_config.t_max_prompt_ms,
\ 't_max_predict_ms': g:llama_config.t_max_predict_ms
\ })
let l:curl_command = printf(
\ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
\ g:llama_config.endpoint, shellescape(l:request)
\ )
if s:current_job != v:null
call jobstop(s:current_job)
endif
" send the request asynchronously
let s:current_job = jobstart(l:curl_command, {
\ 'on_stdout': function('s:fim_on_stdout'),
\ 'on_exit': function('s:fim_on_exit'),
\ 'stdout_buffered': v:true,
\ 'pos_x': s:pos_x,
\ 'pos_y': s:pos_y,
\ 'is_auto': a:is_auto
\ })
" TODO: per-file location
let l:delta_y = abs(s:pos_y - s:pos_y_pick)
" gather some extra context nearby and process it in the background
" only gather chunks if the cursor has moved a lot
" TODO: something more clever? reranking?
if a:is_auto && l:delta_y > 32
" expand the prefix even further
call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)
" pick a suffix chunk
call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)
let s:pos_y_pick = s:pos_y
endif
endfunction
" if first_line == v:true accept only the first line of the response
function! llama#fim_accept(first_line)
" insert the suggestion at the cursor location
if s:can_accept && len(s:content) > 0
call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
if len(s:content) > 1
if !a:first_line
call append(s:pos_y, s:content[1:-1])
endif
endif
" move the cursor to the end of the accepted text
if !a:first_line && len(s:content) > 1
call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
else
call cursor(s:pos_y, s:pos_x + len(s:content[0]))
endif
endif
call llama#fim_cancel()
endfunction
function! llama#fim_cancel()
let s:hint_shown = v:false
" clear the virtual text
let l:bufnr = bufnr('%')
let l:id_vt_fim = nvim_create_namespace('vt_fim')
call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
" remove the mappings
silent! iunmap <buffer> <Tab>
silent! iunmap <buffer> <S-Tab>
silent! iunmap <buffer> <Esc>
endfunction
function! s:on_move()
let s:t_last_move = reltime()
call llama#fim_cancel()
endfunction
" callback that processes the FIM result from the server and displays the suggestion
function! s:fim_on_stdout(job_id, data, event) dict
let l:raw = join(a:data, "\n")
if len(l:raw) == 0
return
endif
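" discard the result if the cursor has moved since the request was started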
if self.pos_x != col('.') - 1 || self.pos_y != line('.')
return
endif
" show the suggestion only in insert mode
if mode() !=# 'i'
return
endif
let s:pos_x = self.pos_x
let s:pos_y = self.pos_y
let s:can_accept = v:true
let l:has_info = v:false
if s:can_accept && v:shell_error
if !self.is_auto
call add(s:content, "<| curl error: is the server on? |>")
endif
let s:can_accept = v:false
endif
let l:n_prompt = 0
let l:t_prompt_ms = 1.0
let l:s_prompt = 0
let l:n_predict = 0
let l:t_predict_ms = 1.0
let l:s_predict = 0
" get the generated suggestion
if s:can_accept
let l:response = json_decode(l:raw)
for l:part in split(get(l:response, 'content', ''), "\n", 1)
call add(s:content, l:part)
endfor
" remove trailing new lines
while len(s:content) > 0 && s:content[-1] == ""
call remove(s:content, -1)
endwhile
let l:generation_settings = get(l:response, 'generation_settings', {})
let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)
let l:n_cached = get(l:response, 'tokens_cached', 0)
let l:truncated = get(l:response, 'truncated', v:false)
" if response.timings is available
if len(get(l:response, 'timings', {})) > 0
let l:has_info = v:true
let l:timings = get(l:response, 'timings', {})
let l:n_prompt = get(l:timings, 'prompt_n', 0)
let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
let l:s_prompt = get(l:timings, 'prompt_per_second', 0)
let l:n_predict = get(l:timings, 'predicted_n', 0)
let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
let l:s_predict = get(l:timings, 'predicted_per_second', 0)
endif
endif
if len(s:content) == 0
call add(s:content, "")
let s:can_accept = v:false
endif
if len(s:content) == 0
return
endif
" NOTE: the following is logic for discarding predictions that repeat existing text
" the code is quite ugly and there is very likely a simpler and more canonical way to implement this
"
" still, I wonder if there is some better way that avoids having to do these special hacks?
" on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
" start generating whatever we have given it via the extra context. but on the other hand, it's not very
" helpful to re-generate the same code that is already there
" truncate the suggestion if the first line is empty
if len(s:content) == 1 && s:content[0] == ""
let s:content = [""]
endif
" ... and the next lines are repeated
if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
let s:content = [""]
endif
" truncate the suggestion if it repeats the suffix
if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
let s:content = [""]
endif
" find the first non-empty line (strip whitespace)
let l:cmp_y = s:pos_y + 1
while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
let l:cmp_y += 1
endwhile
if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
" truncate the suggestion if it repeats the next line
if len(s:content) == 1
let s:content = [""]
endif
" ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
let s:content = [""]
endif
" ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
let s:content = [""]
endif
endif
" keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
"let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
"for i in range(1, len(s:content) - 1)
" if strlen(matchstr(s:content[i], '^\s*')) < l:indent
" let s:content = s:content[:i - 1]
" break
" endif
"endfor
let s:pos_dx = len(s:content[-1])
let s:content[-1] .= s:line_cur_suffix
call llama#fim_cancel()
" display virtual text with the suggestion
let l:bufnr = bufnr('%')
let l:id_vt_fim = nvim_create_namespace('vt_fim')
" construct the info message
if g:llama_config.show_info > 0 && l:has_info
let l:prefix = ' '
if l:truncated
let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
\ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
\ l:n_cached, l:n_ctx
\ )
else
let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
\ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
\ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
\ l:n_prompt, l:t_prompt_ms, l:s_prompt,
\ l:n_predict, l:t_predict_ms, l:s_predict,
\ 1000.0 * reltimefloat(reltime(s:t_fim_start))
\ )
endif
if g:llama_config.show_info == 1
" display the info in the statusline
let &statusline = l:info
let l:info = ''
endif
endif
" display the suggestion and append the info to the end of the first line
call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
\ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
\ 'virt_text_win_col': virtcol('.') - 1
\ })
call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
\ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
\ 'virt_text_win_col': virtcol('.')
\ })
" setup accept shortcuts
inoremap <buffer> <Tab> <C-O>:call llama#fim_accept(v:false)<CR>
inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>
let s:hint_shown = v:true
endfunction
function! s:fim_on_exit(job_id, exit_code, event) dict
if a:exit_code != 0
echom "Job failed with exit code: " . a:exit_code
endif
let s:current_job = v:null
endfunction

View File

@@ -20,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
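The hunks in this range repeatedly apply the same API change: `llama_batch_get_one()` is now called with only a token pointer and a count, without the explicit starting position and sequence id. A minimal sketch of the updated call, assuming the two-argument signature used throughout these diffs:

#include "llama.h"

#include <vector>

// decode a span of tokens with the simplified helper; the position/sequence
// arguments that used to be passed here are gone in this range of commits
static bool decode_tokens(llama_context * ctx, std::vector<llama_token> & tokens) {
    // previously: llama_batch_get_one(tokens.data(), n_eval, n_past, 0)
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());

    return llama_decode(ctx, batch) == 0; // 0 indicates success
}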

View File

@@ -401,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
return true;
}
struct llava_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
@@ -409,8 +442,9 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
float * embd = image_embed->embed+i*n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
@@ -432,7 +466,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_ERR("%s: coulnd't embed the image\n", __func__);
LOG_ERR("%s: couldn't embed the image\n", __func__);
return NULL;
}

View File

@@ -97,7 +97,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}

View File

@@ -89,8 +89,8 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us();
// eval the prompt
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);

View File

@@ -89,8 +89,8 @@ int main(int argc, char ** argv){
const auto t_enc_start = ggml_time_us();
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
const auto t_enc_end = ggml_time_us();

View File

@@ -297,10 +297,6 @@ These options help improve the performance and memory usage of the LLaMA models.
These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
### Memory Float 32
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
### Batch Size
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

View File

@@ -528,7 +528,7 @@ int main(int argc, char ** argv) {
int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
@@ -648,7 +648,7 @@ int main(int argc, char ** argv) {
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}

View File

@@ -308,7 +308,6 @@ int main(int argc, char ** argv) {
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);

View File

@@ -408,14 +408,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
common_batch_clear(batch);
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
}
//LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
if (llama_decode(ctx, batch)) {
//LOG_ERR("%s : failed to eval\n", __func__);
llama_batch_free(batch);
return {tokens, -1, logit_history, prob_history};
}
@@ -435,6 +442,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
@@ -704,7 +713,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);
@@ -1791,6 +1799,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
// clear the KV cache
llama_kv_cache_clear(ctx);
llama_batch batch = llama_batch_init(n_batch, 0, 1);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
@@ -1803,9 +1813,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
common_batch_clear(batch);
for (int i = 0; i < batch_size; i++) {
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
}
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
llama_batch_free(batch);
return;
}
@@ -1818,6 +1833,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
}
}
llama_batch_free(batch);
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {

View File

@@ -42,15 +42,21 @@ int main(int argc, char ** argv) {
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
// tokenize prompt
auto tokens = common_tokenize(ctx, params.prompt, true);
// prepare the batch
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
for (size_t i = 0; i < tokens.size(); i++) {
common_batch_add(batch, tokens[i], i, {0}, false);
}
batch.logits[batch.n_tokens - 1] = true; // generate next token
// evaluate prompt
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
n_past += tokens.size();
llama_decode(ctx, batch);
n_past += batch.n_tokens;
// save state (rng, logits, embedding and kv_cache) to file
{
@@ -77,8 +83,12 @@ int main(int argc, char ** argv) {
printf("%s", next_token_str.c_str());
result0 += next_token_str;
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
common_batch_clear(batch);
common_batch_add(batch, next_token, n_past, {0}, true);
if (llama_decode(ctx, batch)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_batch_free(batch);
llama_free(ctx);
llama_free_model(model);
return 1;
@@ -96,7 +106,6 @@ int main(int argc, char ** argv) {
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
printf("\nsecond run: %s", params.prompt.c_str());
@@ -133,8 +142,12 @@ int main(int argc, char ** argv) {
printf("%s", next_token_str.c_str());
result1 += next_token_str;
if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
common_batch_clear(batch);
common_batch_add(batch, next_token, n_past, {0}, true);
if (llama_decode(ctx2, batch)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_batch_free(batch);
llama_free(ctx2);
llama_free_model(model);
return 1;
@@ -156,7 +169,6 @@ int main(int argc, char ** argv) {
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
printf("\nsingle seq run: %s", params.prompt.c_str());
@@ -221,8 +233,12 @@ int main(int argc, char ** argv) {
printf("%s", next_token_str.c_str());
result2 += next_token_str;
if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
common_batch_clear(batch);
common_batch_add(batch, next_token, n_past, {1}, true);
if (llama_decode(ctx3, batch)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_batch_free(batch);
llama_free(ctx3);
llama_free_model(model);
return 1;
@@ -236,6 +252,7 @@ int main(int argc, char ** argv) {
llama_sampler_free(smpl2);
llama_sampler_free(smpl3);
llama_batch_free(batch);
llama_free(ctx3);
llama_free_model(model);
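The changes above replace `llama_batch_get_one()` with an explicitly filled batch wherever positions and sequence ids still need to be specified. A minimal sketch of that pattern, assuming the `llama_batch_init()` / `common_batch_clear()` / `common_batch_add()` helpers exactly as they are used in these hunks:

#include "llama.h"
#include "common.h"

#include <vector>

// evaluate a prompt, requesting logits only for the last token
static bool eval_prompt(llama_context * ctx, const std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);

    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); i++) {
        // token, position, sequence ids, whether to compute logits for it
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true; // generate from the last token

    const bool ok = llama_decode(ctx, batch) == 0;

    llama_batch_free(batch);
    return ok;
}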

View File

@@ -333,6 +333,8 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_indent`: Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks. Default: `0`
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
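For illustration only, a hypothetical request body exercising the fields documented above; the prompt text and the numeric values are assumptions, not part of this diff:

#include <cstdio>

int main() {
    // n_predict: stop after at most 64 generated tokens
    // n_indent : require at least 4 whitespace characters of indentation per new line
    // n_keep   : keep all prompt tokens if the context overflows
    const char * body = R"({
        "prompt":    "def fib(n):",
        "n_predict": 64,
        "n_indent":  4,
        "n_keep":    -1
    })";
    // POST this body to the server's /completion endpoint with Content-Type: application/json
    std::printf("%s\n", body);
    return 0;
}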

View File

@@ -529,7 +529,7 @@ export class SchemaConverter {
return joinSeq();
};
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
}
_notStrings(strings) {

View File

@@ -131,6 +131,7 @@ struct slot_params {
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
@@ -173,6 +174,8 @@ struct server_slot {
std::vector<llama_token> prompt_tokens;
std::vector<llama_token> extra_tokens;
size_t last_nl_pos = 0;
std::string generated_text;
std::vector<llama_token> cache_tokens;
std::vector<completion_token_output> generated_token_probs;
@@ -215,6 +218,7 @@ struct server_slot {
SLT_DBG(*this, "%s", "\n");
n_prompt_tokens = 0;
last_nl_pos = 0;
generated_text = "";
has_new_line = false;
truncated = false;
@@ -860,6 +864,7 @@ struct server_context {
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent);
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
@@ -878,7 +883,7 @@ struct server_context {
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
@@ -1090,22 +1095,21 @@ struct server_context {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
bool send_text = true;
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
} else {
is_stop_full = false;
} else if (slot.has_next_token) {
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
send_text = stop_pos == std::string::npos;
}
// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
if (send_text) {
// do not send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
@@ -1130,13 +1134,48 @@ struct server_context {
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
}
// if we have already seen a new line, we stop after a certain time limit
if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
(ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
slot.stopped_limit = true;
slot.has_next_token = false;
if (slot.has_new_line) {
// if we have already seen a new line, we stop after a certain time limit
if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
slot.stopped_limit = true;
slot.has_next_token = false;
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
}
// require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
if (slot.params.n_indent > 0) {
// check the current indentation
// TODO: improve by not doing it more than once for each new line
if (slot.last_nl_pos > 0) {
size_t pos = slot.last_nl_pos;
int n_indent = 0;
while (pos < slot.generated_text.size() && (slot.generated_text[pos] == ' ' || slot.generated_text[pos] == '\t')) {
n_indent++;
pos++;
}
if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) {
slot.stopped_limit = true;
slot.has_next_token = false;
// cut the last line
slot.generated_text.erase(pos, std::string::npos);
SLT_DBG(slot, "stopped by indentation limit, n_decoded = %d, n_indent = %d\n", slot.n_decoded, n_indent);
}
}
// find the next new line
{
const size_t pos = slot.generated_text.find('\n', slot.last_nl_pos);
if (pos != std::string::npos) {
slot.last_nl_pos = pos + 1;
}
}
}
}
// check if there is a new line in the generated text
@@ -2287,7 +2326,6 @@ struct server_context {
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};
const int ret = llama_decode(ctx, batch_view);

View File

@@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0);
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
// main loop
@@ -175,7 +175,7 @@ int main(int argc, char ** argv) {
fflush(stdout);
// prepare the next batch with the sampled token
batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0);
batch = llama_batch_get_one(&new_token_id, 1);
n_decode += 1;
}

View File

@@ -39,6 +39,11 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.n_predict < -1) {
LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
return 1;
}
common_init();
if (params.model_draft.empty()) {
@@ -155,9 +160,9 @@ int main(int argc, char ** argv) {
const auto t_enc_start = ggml_time_us();
// eval the prompt with both models
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0));
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
const auto t_enc_end = ggml_time_us();
@@ -180,8 +185,6 @@ int main(int argc, char ** argv) {
// target model sampling context (reuse the llama_context's sampling instance)
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
struct llama_sampler * softmax = llama_sampler_init_softmax();
// draft sequence data
std::vector<seq_draft> drafts(n_seq_dft);
@@ -190,8 +193,8 @@ int main(int argc, char ** argv) {
drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
}
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
const auto t_dec_start = ggml_time_us();
@@ -441,7 +444,7 @@ int main(int argc, char ** argv) {
++n_past_dft;
}
if (n_predict > params.n_predict || has_eos) {
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}
@@ -624,7 +627,6 @@ int main(int argc, char ** argv) {
common_sampler_free(drafts[s].smpl);
}
llama_sampler_free(softmax);
llama_batch_free(batch_dft);
llama_free(ctx_tgt);

View File

@@ -99,6 +99,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -158,6 +161,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING

ggml/include/ggml-amx.h (new file, 25 lines)
View File

@@ -0,0 +1,25 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// buffer_type API
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API
GGML_API ggml_backend_t ggml_backend_amx_init(void);
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus
}
#endif
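A hedged usage sketch for the API declared in this new header (not part of the diff): initialize the AMX backend, verify it, and set its thread count; `ggml_backend_free()` comes from the general ggml-backend API.

#include "ggml-amx.h"

#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_amx_init();
    if (backend == NULL || !ggml_backend_is_amx(backend)) {
        std::fprintf(stderr, "AMX backend not available\n");
        return 1;
    }

    ggml_backend_amx_set_n_threads(backend, 8);

    // ... build a ggml graph and run it with ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);
    return 0;
}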

View File

@@ -34,6 +34,8 @@ extern "C" {
*/
#define GGML_CANN_MAX_DEVICES 16
GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
/**
* @brief Initializes the CANN backend for a specified device.
*

View File

@@ -19,6 +19,8 @@ extern "C" {
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
@@ -29,14 +31,19 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const fl
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_backend_sycl_get_device_description(int device,
char *description,
size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
#ifdef __cplusplus
}
#endif

View File

@@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
#ifdef __cplusplus
}
#endif

View File

@@ -2488,6 +2488,7 @@ extern "C" {
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_amx_int8 (void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);

View File

@@ -267,6 +267,26 @@ if (GGML_LLAMAFILE)
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
endif()
if (GGML_AMX)
if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
else()
set(GGML_AMX OFF)
message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
endif()
if (GGML_AMX)
message(STATUS "Using AMX")
list(APPEND GGML_CDEF_PUBLIC GGML_USE_AMX)
file(GLOB GGML_HEADERS_AMX "ggml-amx/*.h")
list(APPEND GGML_HEADERS_AMX "../include/ggml-amx.h")
file(GLOB GGML_SOURCES_AMX "ggml-amx/*.cpp")
list(APPEND GGML_SOURCES_AMX "ggml-amx.cpp")
endif()
endif()
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
@@ -1180,6 +1200,18 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
@@ -1215,6 +1247,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
@@ -1340,6 +1381,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h
)

ggml/src/ggml-amx.cpp (new file, 453 lines)
View File

@@ -0,0 +1,453 @@
#include "ggml-amx.h"
#include "ggml-amx/common.h"
#include "ggml-amx/mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__)
// AMX buffer interface
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
return "AMX";
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)(buffer->context);
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (qtype_has_amx_kernels(tensor->type)) {
ggml_backend_amx_convert_weight(tensor, data, offset, size);
} else {
memcpy((char *)tensor->data + offset, data, size);
}
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
if (ggml_backend_buffer_is_host(src->buffer)) {
if (qtype_has_amx_kernels(src->type)) {
ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
} else {
memcpy(dst->data, src->data, ggml_nbytes(src));
}
return true;
}
return false;
GGML_UNUSED(buffer);
}
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .get_name = */ ggml_backend_amx_buffer_get_name,
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "AMX";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return TENSOR_ALIGNMENT;
GGML_UNUSED(buft);
}
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
return ggml_backend_amx_get_alloc_size(tensor);
GGML_UNUSED(buft);
}
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ NULL,
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_amx;
}
// backend interface
static const char * ggml_backend_amx_name(ggml_backend_t backend) {
return "AMX";
GGML_UNUSED(backend);
}
static void ggml_backend_amx_free(ggml_backend_t backend) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
delete ctx;
delete backend;
}
static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_amx_buffer_type();
GGML_UNUSED(backend);
}
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];
switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_backend_amx_mul_mat(ctx, node);
break;
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
break;
default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
}
}
return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend);
}
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_amx_guid() {
static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e };
return &guid;
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
fprintf(stderr, "AMX is not ready to be used!\n");
return false;
}
return true;
#elif defined(_WIN32)
return true;
#endif
}
ggml_backend_t ggml_backend_amx_init() {
// invoke a Linux system call to request access to AMX features
ggml_amx_init();
// backend context
ggml_backend_amx_context * ctx = new ggml_backend_amx_context;
// ggml amx backend
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_amx_guid(),
/* .interface = */ ggml_backend_amx_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ ctx,
};
return backend;
}
bool ggml_backend_is_amx(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
}
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
GGML_ASSERT(ggml_backend_is_amx(backend_amx));
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context;
ctx->n_threads = n_threads;
}
// device interface
static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
return "AMX";
GGML_UNUSED(dev);
}
static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
return "Intel Advanced Matrix Extensions";
GGML_UNUSED(dev);
}
static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
// TODO
*free = 0;
*total = 0;
GGML_UNUSED(dev);
}
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU;
GGML_UNUSED(dev);
}
static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_amx_device_get_name(dev);
props->description = ggml_backend_amx_device_get_description(dev);
props->type = ggml_backend_amx_device_get_type(dev);
ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);
// `buffer_from_host_ptr` is intended to be used with mmap, when the memory layout is unchanged
props->caps = {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ false,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_amx_init();
GGML_UNUSED(dev);
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_amx_buffer_type();
GGML_UNUSED(dev);
}
static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
// handle only 2d gemm for now
auto is_contiguous_2d = [](const struct ggml_tensor * t) {
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
};
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
return true;
case GGML_OP_MUL_MAT: {
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];
const enum ggml_type type = src0->type;
const int64_t ne0 = op->ne[0];
bool is_training = src0->grad || src1->grad;
// AMX kernels are enabled for Q4_0, Q4_1, Q8_0, F16
// Q4_K, Q5_K, Q6_K, IQ4_XS are enabled when QK_K == 256
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
bool can_use_amx =
is_contiguous_2d(src0) && // src0 must be contiguous
is_contiguous_2d(src1) && // src1 must be contiguous
!is_training && // inference only
src1->type == GGML_TYPE_F32 && // src1 must be float32
has_amx_kernels && // with amx kernel impls
ne0 % (TILE_N * 2) == 0; // out_features is a multiple of 32
return can_use_amx;
}
default:
return false;
}
GGML_UNUSED(dev);
}
static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
GGML_UNUSED(dev);
}
static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
/* .get_name = */ ggml_backend_amx_device_get_name,
/* .get_description = */ ggml_backend_amx_device_get_description,
/* .get_memory = */ ggml_backend_amx_device_get_memory,
/* .get_type = */ ggml_backend_amx_device_get_type,
/* .get_props = */ ggml_backend_amx_device_get_props,
/* .init_backend = */ ggml_backend_amx_device_init,
/* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_amx_device_supports_op,
/* .supports_buft = */ ggml_backend_amx_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
return "AMX";
GGML_UNUSED(reg);
}
static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
static ggml_backend_device ggml_backend_amx_device = {
/* .iface = */ ggml_backend_amx_device_i,
/* .reg = */ reg,
/* .context = */ nullptr,
};
return &ggml_backend_amx_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
}
static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_amx_set_n_threads;
}
return NULL;
GGML_UNUSED(reg);
GGML_UNUSED(name);
}
static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
/* .get_name = */ ggml_backend_amx_reg_get_name,
/* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
/* .get_device = */ ggml_backend_amx_reg_get_device,
/* .get_proc_address = */ ggml_backend_amx_get_proc_address,
};
ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL,
};
return &ggml_backend_amx_reg;
}
#else // if defined(__AMX_INT8__)
ggml_backend_t ggml_backend_amx_init(void) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
return ggml_backend_t{};
}
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
fprintf(stderr, "GGML is not compiled with AMX support!\n");
GGML_UNUSED(backend_amx);
GGML_UNUSED(n_threads);
}
#endif

View File

@@ -0,0 +1,93 @@
#pragma once
#include "ggml.h"
#include "ggml-cpu-impl.h" // <immintrin.h>
#include <algorithm>
#include <memory>
#include <type_traits>
#if defined(_OPENMP)
#include <omp.h>
#endif
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4
#define AMX_BLK_SIZE 32
#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7
// parallel routines
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) { return (x + y - 1) / y; }
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
// onednn partition pattern
T& n_my = n_end;
if (nth <= 1 || n == 0) {
n_start = 0;
n_my = n;
} else {
T n1 = div_up(n, nth);
T n2 = n1 - 1;
T T1 = n - n2 * nth;
n_my = ith < T1 ? n1 : n2;
n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
}
n_end += n_start;
#else
// pytorch aten partition pattern
T n_my = div_up(n, nth);
n_start = ith * n_my;
n_end = std::min(n_start + n_my, n);
#endif
}
template <typename func_t>
inline void parallel_for(int nth, int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
{
//int nth = omp_get_num_threads();
int ith = omp_get_thread_num();
int tbegin, tend;
balance211(n, nth, ith, tbegin, tend);
f(tbegin, tend);
}
#else
f(0, n);
GGML_UNUSED(nth);
#endif
}
// quantized types that have AMX support
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
// TODO: fix padding for vnni format
return (type == GGML_TYPE_Q4_0) ||
(type == GGML_TYPE_Q4_1);
//(type == GGML_TYPE_Q8_0) ||
//(type == GGML_TYPE_Q4_K) ||
//(type == GGML_TYPE_Q5_K) ||
//(type == GGML_TYPE_Q6_K) ||
//(type == GGML_TYPE_IQ4_XS);
}
// ggml backend context
struct ggml_backend_amx_context {
int n_threads = GGML_DEFAULT_N_THREADS;
std::unique_ptr<char[]> work_data;
size_t work_size = 0;
};
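An illustrative use of the `parallel_for()` helper defined above (assuming this header is included): the range [0, n) is split across nth threads by `balance211()`, and each thread processes its half-open [begin, end) slice.

#include <cstdio>
#include <vector>

int main() {
    const int n   = 100;
    const int nth = 4;

    std::vector<float> out(n, 0.0f);

    parallel_for(nth, n, [&](int begin, int end) {
        for (int i = begin; i < end; i++) {
            out[i] = 2.0f * i; // per-element work, disjoint across threads
        }
    });

    std::printf("out[%d] = %.1f\n", n - 1, out[n - 1]);
    return 0;
}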

ggml/src/ggml-amx/mmq.cpp (new file, 2509 lines)

File diff suppressed because it is too large.

ggml/src/ggml-amx/mmq.h (new file, 17 lines)
View File

@@ -0,0 +1,17 @@
#pragma once
#include "common.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@@ -329,7 +329,6 @@ bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type
if (backend->device) {
return ggml_backend_dev_supports_buft(backend->device, buft);
}
return backend->iface.supports_buft(backend, buft);
}
@@ -538,6 +537,14 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@@ -546,6 +553,18 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
#include "ggml-rpc.h"
#endif
#ifndef __AMX_INT8__
#undef GGML_USE_AMX
#endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends;
std::vector<ggml_backend_dev_t> devices;
@@ -557,14 +576,26 @@ struct ggml_backend_registry {
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_SYCL
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
// TODO: sycl, vulkan, kompute, cann
// TODO: kompute
register_backend(ggml_backend_cpu_reg());
}
@@ -682,8 +713,6 @@ ggml_backend_t ggml_backend_init_best(void) {
// backend CPU
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
return "CPU";
@@ -702,7 +731,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
}
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
ggml_aligned_free(buffer->context, buffer->size);
}
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,14 +799,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
}
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
auto alloc_size = size;
if (alloc_size == 0) {
alloc_size = 1;
}
void * data = ggml_aligned_malloc(alloc_size);
if (data == NULL) {
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
}
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -2234,6 +2268,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->backends[b] = backends[b];
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
if (sched->n_copies > 1) {
for (int c = 0; c < sched->n_copies; c++) {
sched->events[b][c] = ggml_backend_event_new(backends[b]->device);


@@ -39,6 +39,8 @@
#include "ggml-common.h"
#define GGML_CANN_NAME "CANN"
/**
* @brief Handles CANN errors by printing an error message and aborting.
*
@@ -851,13 +853,6 @@ static void ggml_backend_cann_buffer_set_tensor(
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
@@ -969,7 +964,7 @@ static void ggml_backend_cann_buffer_clear(
* This structure defines function pointers to operations that can be performed
* on a CANN buffer within the backend.
*/
static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .get_name = */ ggml_backend_cann_buffer_get_name,
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
/* .get_base = */ ggml_backend_cann_buffer_get_base,
@@ -1105,19 +1100,25 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
GGML_UNUSED(buft);
}
static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
GGML_UNUSED(buft);
}
/**
* @brief Interface for managing CANN buffer types in the GGML backend.
*
* Provides function pointers for allocating, querying properties, and managing
* memory for CANN buffer types in the GGML backend.
*/
static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
/* .get_name = */ ggml_backend_cann_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
/* .is_host = */ ggml_backend_cann_buffer_type_is_host,
};
/**
@@ -1148,7 +1149,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
/* .context = */
new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)},
@@ -1264,7 +1265,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
/* .context = */ nullptr,
};
@@ -1511,13 +1512,6 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp(data, check_buffer, size));
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpyAsync(
(char *)tensor->data + offset, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
@@ -1692,7 +1686,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
* @return bool Returns true if the operation is supported by the backend,
* otherwise false.
*/
static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
const ggml_tensor* op) {
switch (op->op) {
case GGML_OP_UNARY:
@@ -1783,7 +1777,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
return false;
}
GGML_UNUSED(backend);
GGML_UNUSED(dev);
}
/**
@@ -1801,31 +1795,6 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
* @brief Checks if the CANN backend supports a specific backend buffer type.
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
* It returns true if the devices are same between the backend context and
* buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the CANN backend supports the buffer type,
* otherwise false.
*/
static bool ggml_backend_cann_supports_buft(
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_context * cann_ctx =
(ggml_backend_cann_context *)backend->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == cann_ctx->device;
}
return false;
}
/**
* @brief Determines if a tensor operation should be offloaded to the CANN
* backend.
@@ -1840,54 +1809,14 @@ static bool ggml_backend_cann_supports_buft(
* @return bool Returns true if the operation should be offloaded, otherwise
* false.
*/
static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
const ggml_tensor* op) {
const int min_batch_size = 32;
GGML_UNUSED(backend);
GGML_UNUSED(dev);
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
}
/**
* @brief Creates a new event for the CANN backend.
*
* This function initializes a new event for the CANN backend by setting the
* device and creating an ACL runtime event. The created event is then wrapped
* in a ggml_backend_event structure and returned.
*
* @param backend Pointer to the CANN backend.
* @return ggml_backend_event_t Returns a pointer to the new event structure.
*/
static ggml_backend_event_t ggml_backend_cann_event_new(
ggml_backend_t backend) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_cann_set_device(cann_ctx->device);
aclrtEvent event;
ACL_CHECK(aclrtCreateEvent(&event));
return new ggml_backend_event{
/* .device = */ nullptr,
/* .context = */ event,
};
}
/**
* @brief Frees a CANN backend event.
*
* This function destroys the ACL runtime event associated with the given CANN
* backend event and then deletes the event structure itself.
*
* @param event Pointer to the event structure to be freed.
*/
static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
delete event;
}
/**
* @brief Records an event on the CANN backend stream.
*
@@ -1924,17 +1853,6 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
}
}
/**
* @brief Synchronizes the given event on the CANN backend.
*
* This function waits for the specified event to complete on the ACL runtime.
*
* @param event Pointer to the event structure to be synchronized.
*/
static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
}
/**
* @brief Structure defining the interface for the CANN backend.
*
@@ -1942,7 +1860,7 @@ static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
* supported by the CANN backend, including name retrieval, memory
* management, tensor operations, synchronization, and event handling.
*/
static ggml_backend_i ggml_backend_cann_interface = {
static const ggml_backend_i ggml_backend_cann_interface = {
/* .get_name = */ ggml_backend_cann_name,
/* .free = */ ggml_backend_cann_free,
/* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
@@ -1955,9 +1873,9 @@ static ggml_backend_i ggml_backend_cann_interface = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cann_graph_compute,
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .supports_op = */ NULL, // moved to device
/* .supports_buft = */ NULL, // moved to device
/* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait,
};
@@ -1976,6 +1894,234 @@ static ggml_guid_t ggml_backend_cann_guid() {
return &guid;
}
// backend device
struct ggml_backend_cann_device_context {
int device;
std::string name;
std::string description;
};
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ctx->name.c_str();
}
static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
ggml_backend_cann_get_device_memory(ctx->device, free, total);
}
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
}
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_cann_device_get_name(dev);
props->description = ggml_backend_cann_device_get_description(dev);
props->type = ggml_backend_cann_device_get_type(dev);
ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr;
props->caps = {
/* .async = */ false,
/* .host_buffer = */ host_buffer,
/* .buffer_from_host_ptr = */ false,
/* .events = */ true,
};
}
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ggml_backend_cann_init(ctx->device);
}
/**
* @brief Checks if the CANN backend supports a specific backend buffer type.
*
* This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type.
* It returns true if the devices are same between the backend context and
* buffer type context.
*
* @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check.
* @return bool Returns true if the CANN backend supports the buffer type,
* otherwise false.
*/
static bool ggml_backend_cann_supports_buft(
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == dev_ctx->device;
}
return false;
}
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
return ggml_backend_cann_buffer_type(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
return ggml_backend_cann_host_buffer_type();
}
/**
* @brief Creates a new event for the CANN backend device.
*
* This function initializes a new event for the CANN backend by setting the
* device and creating an ACL runtime event. The created event is then wrapped
* in a ggml_backend_event structure and returned.
*
* @param backend Pointer to the CANN backend.
* @return ggml_backend_event_t Returns a pointer to the new event structure.
*/
static ggml_backend_event_t ggml_backend_cann_device_event_new(
ggml_backend_dev_t dev) {
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
ggml_cann_set_device(dev_ctx->device);
aclrtEvent event;
ACL_CHECK(aclrtCreateEvent(&event));
return new ggml_backend_event{
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device),
/* .context = */ event,
};
}
/**
* @brief Frees a CANN backend event.
*
* This function destroys the ACL runtime event associated with the given CANN
* backend event and then deletes the event structure itself.
*
* @param event Pointer to the event structure to be freed.
*/
static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
delete event;
GGML_UNUSED(dev);
}
/**
* @brief Synchronizes the given event on the CANN backend.
*
* This function waits for the specified event to complete on the ACL runtime.
*
* @param event Pointer to the event structure to be synchronized.
*/
static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
GGML_UNUSED(dev);
}
static const ggml_backend_device_i ggml_backend_cann_device_interface = {
/* .get_name = */ ggml_backend_cann_device_get_name,
/* .get_description = */ ggml_backend_cann_device_get_description,
/* .get_memory = */ ggml_backend_cann_device_get_memory,
/* .get_type = */ ggml_backend_cann_device_get_type,
/* .get_props = */ ggml_backend_cann_device_get_props,
/* .init_backend = */ ggml_backend_cann_device_init, // called for every card
/* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL, // not supported for CANN
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .event_new = */ ggml_backend_cann_device_event_new,
/* .event_free = */ ggml_backend_cann_device_event_free,
/* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
};
// backend reg
struct ggml_backend_cann_reg_context {
std::vector<ggml_backend_dev_t> devices;
};
static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
GGML_UNUSED(reg);
return GGML_CANN_NAME;
}
static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
return ctx->devices.size();
}
static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
}
static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
GGML_UNUSED(name);
// reserved for future use
return nullptr;
}
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device_get = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
};
// backend registry, called only once for cann backend
ggml_backend_reg_t ggml_backend_cann_reg() {
static ggml_backend_reg reg;
static bool initialized = false;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
aclInit(nullptr);
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
for (int i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
dev_ctx->description = aclrtGetSocName();
dev_ctx->device = i;
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg,
/* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cann_reg_interface,
/* .context = */ ctx
};
}
initialized = true;
}
return &reg;
}
ggml_backend_t ggml_backend_cann_init(int32_t device) {
aclInit(nullptr);
if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
@@ -1992,7 +2138,7 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
ggml_backend_t cann_backend =
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
/* .interface = */ ggml_backend_cann_interface,
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
/* .context = */ ctx};
return cann_backend;
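A minimal usage sketch for the updated entry point, assuming only the public declarations in ggml-cann.h and ggml-backend.h (this snippet is not part of the change set): initialize the first CANN device, print the backend name, and release it.

    #include <cstdio>
    #include "ggml-cann.h"
    #include "ggml-backend.h"

    int main() {
        ggml_backend_t backend = ggml_backend_cann_init(0); // device index 0
        if (backend == nullptr) {
            fprintf(stderr, "no CANN device available\n");
            return 1;
        }
        printf("initialized backend: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend); // frees the backend and its context
        return 0;
    }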


@@ -19,6 +19,9 @@ extern "C" {
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
// required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
// Memory allocation
void * ggml_aligned_malloc(size_t size);
void ggml_aligned_free(void * ptr, size_t size);
#ifdef __cplusplus
}
#endif
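The two declarations above pair with the CPU buffer changes earlier in this compare: allocation goes through ggml_aligned_malloc and the matching free must pass the same size back to ggml_aligned_free. A small sketch of that pairing, assuming ggml-impl.h is available to the caller (it is an internal header, so this is illustrative only):

    #include <string.h>
    #include "ggml-impl.h"

    int main() {
        size_t size       = 0;                    // a zero-byte request...
        size_t alloc_size = size > 0 ? size : 1;  // ...is bumped to 1, as in the CPU buffer code
        void * data = ggml_aligned_malloc(alloc_size);
        if (data != NULL) {
            memset(data, 0, alloc_size);
            ggml_aligned_free(data, alloc_size);  // the free now takes the size as well
        }
        return 0;
    }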

View File

@@ -57,8 +57,9 @@ struct socket_t {
}
};
// ggml_tensor is serialized into rpc_tensor
// all RPC structures must be packed
#pragma pack(push, 1)
// ggml_tensor is serialized into rpc_tensor
struct rpc_tensor {
uint64_t id;
uint32_t type;
@@ -76,7 +77,6 @@ struct rpc_tensor {
char padding[4];
};
#pragma pack(pop)
static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of 8");
@@ -96,6 +96,65 @@ enum rpc_cmd {
RPC_CMD_COUNT,
};
struct rpc_msg_alloc_buffer_req {
uint64_t size;
};
struct rpc_msg_alloc_buffer_rsp {
uint64_t remote_ptr;
uint64_t remote_size;
};
struct rpc_msg_get_alignment_rsp {
uint64_t alignment;
};
struct rpc_msg_get_max_size_rsp {
uint64_t max_size;
};
struct rpc_msg_buffer_get_base_req {
uint64_t remote_ptr;
};
struct rpc_msg_buffer_get_base_rsp {
uint64_t base_ptr;
};
struct rpc_msg_free_buffer_req {
uint64_t remote_ptr;
};
struct rpc_msg_buffer_clear_req {
uint64_t remote_ptr;
uint8_t value;
};
struct rpc_msg_get_tensor_req {
rpc_tensor tensor;
uint64_t offset;
uint64_t size;
};
struct rpc_msg_copy_tensor_req {
rpc_tensor src;
rpc_tensor dst;
};
struct rpc_msg_copy_tensor_rsp {
uint8_t result;
};
struct rpc_msg_graph_compute_rsp {
uint8_t result;
};
struct rpc_msg_get_device_memory_rsp {
uint64_t free_mem;
uint64_t total_mem;
};
#pragma pack(pop)
// RPC data structures
static ggml_guid_t ggml_backend_rpc_guid() {
@@ -240,6 +299,38 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
return true;
}
static bool send_msg(sockfd_t sockfd, const void * msg, size_t msg_size) {
if (!send_data(sockfd, &msg_size, sizeof(msg_size))) {
return false;
}
return send_data(sockfd, msg, msg_size);
}
static bool recv_msg(sockfd_t sockfd, void * msg, size_t msg_size) {
uint64_t size;
if (!recv_data(sockfd, &size, sizeof(size))) {
return false;
}
if (size != msg_size) {
return false;
}
return recv_data(sockfd, msg, msg_size);
}
static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
uint64_t size;
if (!recv_data(sockfd, &size, sizeof(size))) {
return false;
}
try {
input.resize(size);
} catch (const std::bad_alloc & e) {
fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size);
return false;
}
return recv_data(sockfd, input.data(), size);
}
static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
size_t pos = endpoint.find(':');
if (pos == std::string::npos) {
@@ -252,28 +343,27 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
uint8_t cmd_byte = cmd;
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
return false;
}
uint64_t input_size = input.size();
if (!send_data(sock->fd, &input_size, sizeof(input_size))) {
return false;
}
if (!send_data(sock->fd, input.data(), input.size())) {
if (!send_data(sock->fd, input, input_size)) {
return false;
}
uint64_t output_size;
if (!recv_data(sock->fd, &output_size, sizeof(output_size))) {
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
// even if we do, we can skip sending output_size from the server for commands with known output size
uint64_t out_size;
if (!recv_data(sock->fd, &out_size, sizeof(out_size))) {
return false;
}
if (output_size == 0) {
output.clear();
return true;
if (out_size != output_size) {
return false;
}
output.resize(output_size);
if (!recv_data(sock->fd, output.data(), output_size)) {
if (!recv_data(sock->fd, output, output_size)) {
return false;
}
return true;
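Every wrapper further down in this file now follows this fixed-size request/response pattern. Condensed into one hypothetical helper (it mirrors the get_alignment client code that appears later in this diff, so nothing here is new API):

    static size_t query_server_alignment(const std::shared_ptr<socket_t> & sock) {
        rpc_msg_get_alignment_rsp response;
        // empty request body, fixed-size packed response
        if (!send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response))) {
            return 0; // treat a failed round-trip as "unknown" (assumption for this sketch)
        }
        return (size_t) response.alignment;
    }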
@@ -326,14 +416,9 @@ static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffe
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
// input serialization format: | remote_ptr (8 bytes) |
std::vector<uint8_t> input(sizeof(uint64_t), 0);
uint64_t remote_ptr = ctx->remote_ptr;
memcpy(input.data(), &remote_ptr, sizeof(remote_ptr));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, input, output);
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
GGML_ASSERT(output.empty());
delete ctx;
}
@@ -342,20 +427,13 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
return ctx->base_cache[buffer];
}
// input serialization format: | remote_ptr (8 bytes) |
std::vector<uint8_t> input(sizeof(uint64_t), 0);
uint64_t remote_ptr = ctx->remote_ptr;
memcpy(input.data(), &remote_ptr, sizeof(remote_ptr));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, input, output);
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
rpc_msg_buffer_get_base_rsp response;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
GGML_ASSERT(output.size() == sizeof(uint64_t));
// output serialization format: | base_ptr (8 bytes) |
uint64_t base_ptr;
memcpy(&base_ptr, output.data(), sizeof(base_ptr));
void * base = reinterpret_cast<void *>(base_ptr);
ctx->base_cache[buffer] = base;
return base;
void * base_ptr = reinterpret_cast<void *>(response.base_ptr);
ctx->base_cache[buffer] = base_ptr;
return base_ptr;
}
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
@@ -405,26 +483,18 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
GGML_ASSERT(status);
}
static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
// input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t);
std::vector<uint8_t> input(input_size, 0);
rpc_tensor rpc_tensor = serialize_tensor(tensor);
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &size, sizeof(size));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, input, output);
rpc_msg_get_tensor_req request;
request.tensor = serialize_tensor(tensor);
request.offset = offset;
request.size = size;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, &request, sizeof(request), data, size);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == size);
// output serialization format: | data (size bytes) |
memcpy(data, output.data(), size);
}
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
@@ -437,30 +507,19 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
return false;
}
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
// input serialization format: | rpc_tensor src | rpc_tensor dst |
int input_size = 2*sizeof(rpc_tensor);
std::vector<uint8_t> input(input_size, 0);
rpc_tensor rpc_src = serialize_tensor(src);
rpc_tensor rpc_dst = serialize_tensor(dst);
memcpy(input.data(), &rpc_src, sizeof(rpc_src));
memcpy(input.data() + sizeof(rpc_src), &rpc_dst, sizeof(rpc_dst));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, input, output);
rpc_msg_copy_tensor_req request;
request.src = serialize_tensor(src);
request.dst = serialize_tensor(dst);
rpc_msg_copy_tensor_rsp response;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
// output serialization format: | result (1 byte) |
GGML_ASSERT(output.size() == 1);
return output[0];
return response.result;
}
static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
// serialization format: | bufptr (8 bytes) | value (1 byte) |
int input_size = sizeof(uint64_t) + sizeof(uint8_t);
std::vector<uint8_t> input(input_size, 0);
memcpy(input.data(), &ctx->remote_ptr, sizeof(ctx->remote_ptr));
memcpy(input.data() + sizeof(ctx->remote_ptr), &value, sizeof(value));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, input, output);
rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
}
@@ -484,25 +543,16 @@ static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t
static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
// input serialization format: | size (8 bytes) |
int input_size = sizeof(uint64_t);
std::vector<uint8_t> input(input_size, 0);
memcpy(input.data(), &size, sizeof(size));
std::vector<uint8_t> output;
rpc_msg_alloc_buffer_req request = {size};
rpc_msg_alloc_buffer_rsp response;
auto sock = get_socket(buft_ctx->endpoint);
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
uint64_t remote_ptr;
memcpy(&remote_ptr, output.data(), sizeof(remote_ptr));
size_t remote_size;
memcpy(&remote_size, output.data() + sizeof(uint64_t), sizeof(remote_size));
if (remote_ptr != 0) {
if (response.remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
remote_size);
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
response.remote_size);
return buffer;
} else {
return nullptr;
@@ -510,16 +560,10 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
}
static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
// input serialization format: | 0 bytes |
std::vector<uint8_t> input;
std::vector<uint8_t> output;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, input, output);
rpc_msg_get_alignment_rsp response;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, nullptr, 0, &response, sizeof(response));
GGML_ASSERT(status);
GGML_ASSERT(output.size() == sizeof(uint64_t));
// output serialization format: | alignment (8 bytes) |
uint64_t alignment;
memcpy(&alignment, output.data(), sizeof(alignment));
return alignment;
return response.alignment;
}
static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -528,16 +572,10 @@ static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_typ
}
static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
// input serialization format: | 0 bytes |
std::vector<uint8_t> input;
std::vector<uint8_t> output;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, input, output);
rpc_msg_get_max_size_rsp response;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, nullptr, 0, &response, sizeof(response));
GGML_ASSERT(status);
GGML_ASSERT(output.size() == sizeof(uint64_t));
// output serialization format: | max_size (8 bytes) |
uint64_t max_size;
memcpy(&max_size, output.data(), sizeof(max_size));
return max_size;
return response.max_size;
}
static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -622,12 +660,11 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
std::vector<uint8_t> input;
serialize_graph(cgraph, input);
std::vector<uint8_t> output;
rpc_msg_graph_compute_rsp response;
auto sock = get_socket(rpc_ctx->endpoint);
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
GGML_ASSERT(status);
GGML_ASSERT(output.size() == 1);
return (enum ggml_status)output[0];
return (enum ggml_status)response.result;
}
static ggml_backend_i ggml_backend_rpc_interface = {
@@ -702,19 +739,11 @@ GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
}
static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * free, size_t * total) {
// input serialization format: | 0 bytes |
std::vector<uint8_t> input;
std::vector<uint8_t> output;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, input, output);
rpc_msg_get_device_memory_rsp response;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, nullptr, 0, &response, sizeof(response));
GGML_ASSERT(status);
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
// output serialization format: | free (8 bytes) | total (8 bytes) |
uint64_t free_mem;
memcpy(&free_mem, output.data(), sizeof(free_mem));
uint64_t total_mem;
memcpy(&total_mem, output.data() + sizeof(uint64_t), sizeof(total_mem));
*free = free_mem;
*total = total_mem;
*free = response.free_mem;
*total = response.total_mem;
}
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
@@ -734,16 +763,16 @@ public:
rpc_server(ggml_backend_t backend) : backend(backend) {}
~rpc_server();
bool alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
void get_alignment(std::vector<uint8_t> & output);
void get_max_size(std::vector<uint8_t> & output);
bool buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
bool free_buffer(const std::vector<uint8_t> & input);
bool buffer_clear(const std::vector<uint8_t> & input);
void alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response);
void get_alignment(rpc_msg_get_alignment_rsp & response);
void get_max_size(rpc_msg_get_max_size_rsp & response);
bool buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response);
bool free_buffer(const rpc_msg_free_buffer_req & request);
bool buffer_clear(const rpc_msg_buffer_clear_req & request);
bool set_tensor(const std::vector<uint8_t> & input);
bool get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
bool copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
bool graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output);
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
private:
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@@ -757,80 +786,50 @@ private:
std::unordered_set<ggml_backend_buffer_t> buffers;
};
bool rpc_server::alloc_buffer(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
// input serialization format: | size (8 bytes) |
if (input.size() != sizeof(uint64_t)) {
return false;
}
uint64_t size;
memcpy(&size, input.data(), sizeof(size));
void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_alloc_buffer_rsp & response) {
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
uint64_t remote_ptr = 0;
uint64_t remote_size = 0;
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, request.size);
response.remote_ptr = 0;
response.remote_size = 0;
if (buffer != nullptr) {
remote_ptr = reinterpret_cast<uint64_t>(buffer);
remote_size = buffer->size;
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, size, remote_ptr, remote_size);
response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
response.remote_size = buffer->size;
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
buffers.insert(buffer);
} else {
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, size);
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
}
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
output.resize(2*sizeof(uint64_t), 0);
memcpy(output.data(), &remote_ptr, sizeof(remote_ptr));
memcpy(output.data() + sizeof(uint64_t), &remote_size, sizeof(remote_size));
return true;
}
void rpc_server::get_alignment(std::vector<uint8_t> & output) {
void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
size_t alignment = ggml_backend_buft_get_alignment(buft);
GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
// output serialization format: | alignment (8 bytes) |
output.resize(sizeof(uint64_t), 0);
memcpy(output.data(), &alignment, sizeof(alignment));
response.alignment = alignment;
}
void rpc_server::get_max_size(std::vector<uint8_t> & output) {
void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
size_t max_size = ggml_backend_buft_get_max_size(buft);
GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
// output serialization format: | max_size (8 bytes) |
output.resize(sizeof(uint64_t), 0);
memcpy(output.data(), &max_size, sizeof(max_size));
response.max_size = max_size;
}
bool rpc_server::buffer_get_base(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
// input serialization format: | remote_ptr (8 bytes) |
if (input.size() != sizeof(uint64_t)) {
return false;
}
uint64_t remote_ptr;
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
if (buffers.find(buffer) == buffers.end()) {
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
return false;
}
void * base = ggml_backend_buffer_get_base(buffer);
// output serialization format: | base_ptr (8 bytes) |
uint64_t base_ptr = reinterpret_cast<uint64_t>(base);
output.resize(sizeof(uint64_t), 0);
memcpy(output.data(), &base_ptr, sizeof(base_ptr));
response.base_ptr = reinterpret_cast<uint64_t>(base);
return true;
}
bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
// input serialization format: | remote_ptr (8 bytes) |
if (input.size() != sizeof(uint64_t)) {
return false;
}
uint64_t remote_ptr;
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, remote_ptr);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
if (buffers.find(buffer) == buffers.end()) {
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
return false;
@@ -840,22 +839,14 @@ bool rpc_server::free_buffer(const std::vector<uint8_t> & input) {
return true;
}
bool rpc_server::buffer_clear(const std::vector<uint8_t> & input) {
// input serialization format: | remote_ptr (8 bytes) | value (1 byte) |
if (input.size() != sizeof(uint64_t) + sizeof(uint8_t)) {
return false;
}
uint64_t remote_ptr;
memcpy(&remote_ptr, input.data(), sizeof(remote_ptr));
uint8_t value;
memcpy(&value, input.data() + sizeof(uint64_t), sizeof(value));
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, remote_ptr, value);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(remote_ptr);
bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
if (buffers.find(buffer) == buffers.end()) {
GGML_PRINT_DEBUG("[%s] buffer not found\n", __func__);
return false;
}
ggml_backend_buffer_clear(buffer, value);
ggml_backend_buffer_clear(buffer, request.value);
return true;
}
@@ -930,74 +921,55 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
return true;
}
bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
// serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
if (input.size() != sizeof(rpc_tensor) + 2*sizeof(uint64_t)) {
return false;
}
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
uint64_t offset;
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
uint64_t size;
memcpy(&size, input.data() + sizeof(rpc_tensor) + sizeof(offset), sizeof(size));
bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response) {
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) {
GGML_PRINT_DEBUG("[%s] error deserializing tensor\n", __func__);
ggml_free(ctx);
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
if (request.tensor.data + request.offset < p0 ||
request.tensor.data + request.offset >= p1 ||
request.size > (p1 - request.tensor.data - request.offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
// output serialization format: | data (size bytes) |
output.resize(size, 0);
ggml_backend_tensor_get(tensor, output.data(), offset, size);
response.resize(request.size, 0);
ggml_backend_tensor_get(tensor, response.data(), request.offset, request.size);
ggml_free(ctx);
return true;
}
bool rpc_server::copy_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
// serialization format: | rpc_tensor src | rpc_tensor dst |
if (input.size() != 2*sizeof(rpc_tensor)) {
return false;
}
const rpc_tensor * rpc_src = (const rpc_tensor *)input.data();
const rpc_tensor * rpc_dst = (const rpc_tensor *)(input.data() + sizeof(rpc_src));
bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response) {
struct ggml_init_params params {
/*.mem_size =*/ 2*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
ggml_tensor * src = deserialize_tensor(ctx, rpc_src);
ggml_tensor * dst = deserialize_tensor(ctx, rpc_dst);
ggml_tensor * src = deserialize_tensor(ctx, &request.src);
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
if (src == nullptr || dst == nullptr) {
GGML_PRINT_DEBUG("[%s] error deserializing tensors\n", __func__);
ggml_free(ctx);
return false;
}
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
bool result = ggml_backend_buffer_copy_tensor(src, dst);
// output serialization format: | result (1 byte) |
output.resize(1, 0);
output[0] = result;
response.result = ggml_backend_buffer_copy_tensor(src, dst);
ggml_free(ctx);
return true;
}
@@ -1026,7 +998,7 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
return result;
}
bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response) {
// serialization format:
// | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
if (input.size() < sizeof(uint32_t)) {
@@ -1066,9 +1038,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
}
ggml_status status = ggml_backend_graph_compute(backend, graph);
// output serialization format: | status (1 byte) |
output.resize(1, 0);
output[0] = status;
response.result = status;
ggml_free(ctx);
return true;
}
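For reference, the wire layout named in the comment above ( | n_nodes | node ids | n_tensors | rpc_tensor array | ) can be packed as follows; serialize_graph itself is not shown in this diff, so the helper below is only a hypothetical illustration built from that comment and the rpc_tensor struct defined earlier in the file:

    static void pack_graph_request(const std::vector<uint64_t>   & node_ids,
                                   const std::vector<rpc_tensor> & tensors,
                                   std::vector<uint8_t>          & out) {
        const uint32_t n_nodes   = (uint32_t) node_ids.size();
        const uint32_t n_tensors = (uint32_t) tensors.size();
        out.resize(sizeof(n_nodes)   + n_nodes   * sizeof(uint64_t) +
                   sizeof(n_tensors) + n_tensors * sizeof(rpc_tensor));
        uint8_t * dst = out.data();
        memcpy(dst, &n_nodes, sizeof(n_nodes));                   dst += sizeof(n_nodes);
        memcpy(dst, node_ids.data(), n_nodes * sizeof(uint64_t)); dst += n_nodes * sizeof(uint64_t);
        memcpy(dst, &n_tensors, sizeof(n_tensors));               dst += sizeof(n_tensors);
        memcpy(dst, tensors.data(), n_tensors * sizeof(rpc_tensor));
    }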
@@ -1091,85 +1061,153 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
fprintf(stderr, "Unknown command: %d\n", cmd);
break;
}
std::vector<uint8_t> input;
std::vector<uint8_t> output;
uint64_t input_size;
if (!recv_data(sockfd, &input_size, sizeof(input_size))) {
break;
}
try {
input.resize(input_size);
} catch (const std::bad_alloc & e) {
fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", input_size);
break;
}
if (!recv_data(sockfd, input.data(), input_size)) {
break;
}
bool ok = true;
switch (cmd) {
case RPC_CMD_ALLOC_BUFFER: {
ok = server.alloc_buffer(input, output);
rpc_msg_alloc_buffer_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
rpc_msg_alloc_buffer_rsp response;
server.alloc_buffer(request, response);
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
case RPC_CMD_GET_ALIGNMENT: {
server.get_alignment(output);
if (!recv_msg(sockfd, nullptr, 0)) {
return;
}
rpc_msg_get_alignment_rsp response;
server.get_alignment(response);
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
case RPC_CMD_GET_MAX_SIZE: {
server.get_max_size(output);
if (!recv_msg(sockfd, nullptr, 0)) {
return;
}
rpc_msg_get_max_size_rsp response;
server.get_max_size(response);
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
case RPC_CMD_BUFFER_GET_BASE: {
ok = server.buffer_get_base(input, output);
rpc_msg_buffer_get_base_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
rpc_msg_buffer_get_base_rsp response;
if (!server.buffer_get_base(request, response)) {
return;
}
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
case RPC_CMD_FREE_BUFFER: {
ok = server.free_buffer(input);
rpc_msg_free_buffer_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
if (!server.free_buffer(request)) {
return;
}
if (!send_msg(sockfd, nullptr, 0)) {
return;
}
break;
}
case RPC_CMD_BUFFER_CLEAR: {
ok = server.buffer_clear(input);
rpc_msg_buffer_clear_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
if (!server.buffer_clear(request)) {
return;
}
if (!send_msg(sockfd, nullptr, 0)) {
return;
}
break;
}
case RPC_CMD_SET_TENSOR: {
ok = server.set_tensor(input);
std::vector<uint8_t> input;
if (!recv_msg(sockfd, input)) {
return;
}
if (!server.set_tensor(input)) {
return;
}
if (!send_msg(sockfd, nullptr, 0)) {
return;
}
break;
}
case RPC_CMD_GET_TENSOR: {
ok = server.get_tensor(input, output);
rpc_msg_get_tensor_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
std::vector<uint8_t> response;
if (!server.get_tensor(request, response)) {
return;
}
if (!send_msg(sockfd, response.data(), response.size())) {
return;
}
break;
}
case RPC_CMD_COPY_TENSOR: {
ok = server.copy_tensor(input, output);
rpc_msg_copy_tensor_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
rpc_msg_copy_tensor_rsp response;
if (!server.copy_tensor(request, response)) {
return;
}
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
case RPC_CMD_GRAPH_COMPUTE: {
ok = server.graph_compute(input, output);
std::vector<uint8_t> input;
if (!recv_msg(sockfd, input)) {
return;
}
rpc_msg_graph_compute_rsp response;
if (!server.graph_compute(input, response)) {
return;
}
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
case RPC_CMD_GET_DEVICE_MEMORY: {
// output serialization format: | free (8 bytes) | total (8 bytes) |
output.resize(2*sizeof(uint64_t), 0);
memcpy(output.data(), &free_mem, sizeof(free_mem));
memcpy(output.data() + sizeof(uint64_t), &total_mem, sizeof(total_mem));
if (!recv_msg(sockfd, nullptr, 0)) {
return;
}
rpc_msg_get_device_memory_rsp response;
response.free_mem = free_mem;
response.total_mem = total_mem;
if (!send_msg(sockfd, &response, sizeof(response))) {
return;
}
break;
}
default: {
fprintf(stderr, "Unknown command: %d\n", cmd);
ok = false;
return;
}
}
if (!ok) {
break;
}
uint64_t output_size = output.size();
if (!send_data(sockfd, &output_size, sizeof(output_size))) {
break;
}
if (!send_data(sockfd, output.data(), output_size)) {
break;
}
}
}

File diff suppressed because it is too large


@@ -1,6 +1,6 @@
#include "mmvq.hpp"
#include "vecdotq.hpp"
#include <cassert>
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
@@ -13,7 +13,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -37,7 +38,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
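From here on, the hunks in this file repeat the same substitution of WARP_SIZE with QK_WARP_SIZE inside the XOR-butterfly reduction that folds each lane's partial dot product into a full sum. As a host-side illustration of what that loop computes (plain C++, not SYCL; the width 32 is only an assumption standing in for QK_WARP_SIZE):

    #include <cstdio>
    #include <vector>

    int main() {
        const int WIDTH = 32;                      // stand-in for QK_WARP_SIZE
        std::vector<float> lane(WIDTH);
        for (int i = 0; i < WIDTH; ++i) {
            lane[i] = (float) i;                   // per-lane partial sums
        }
        // each step, lane i adds the value held by lane i ^ mask
        // (the kernels do this exchange with dpct::permute_sub_group_by_xor)
        for (int mask = WIDTH / 2; mask > 0; mask >>= 1) {
            std::vector<float> next(lane);
            for (int i = 0; i < WIDTH; ++i) {
                next[i] = lane[i] + lane[i ^ mask];
            }
            lane.swap(next);
        }
        printf("sum = %.0f\n", lane[0]);           // every lane ends with the full sum, 496
        return 0;
    }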
@@ -61,7 +62,8 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -85,7 +87,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -109,8 +111,8 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -133,7 +135,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -157,8 +159,8 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -181,7 +183,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -205,8 +207,8 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -229,7 +231,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -253,8 +255,8 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -277,7 +279,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -301,8 +303,8 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -325,7 +327,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -349,8 +351,8 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -373,7 +375,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -397,8 +399,8 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -421,7 +423,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -446,8 +448,8 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
}
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int blocks_per_warp = vdr * QK_WARP_SIZE / qi;
assert(blocks_per_warp>0);
// partial sum for each thread
float tmp = 0.0f;
@@ -470,7 +472,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
// sum up partial sums and write back result
#pragma unroll
for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
@@ -487,7 +489,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK4_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -495,7 +497,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -511,7 +513,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -519,7 +521,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -535,7 +537,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -543,7 +545,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -559,7 +561,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -567,7 +569,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -583,7 +585,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -591,7 +593,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -607,7 +609,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -615,7 +617,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -631,7 +633,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -639,7 +641,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -655,7 +657,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -663,7 +665,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -679,7 +681,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -687,7 +689,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -703,7 +705,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -711,7 +713,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
vx, vy, dst, ncols, nrows, item_ct1);
@@ -728,13 +730,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -749,7 +751,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -759,7 +761,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -774,7 +776,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -784,7 +786,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -799,7 +801,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -809,7 +811,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -824,7 +826,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -833,7 +835,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -848,7 +850,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
@@ -858,7 +860,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -873,13 +875,13 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -894,14 +896,14 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK4_NL == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
vx, vy, dst, ncols, nrows, item_ct1);
});
@@ -916,14 +918,14 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, 1, block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
{
stream->submit([&](sycl::handler &cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
vx, vy, dst, ncols, nrows, item_ct1);
});
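For reference, a stand-alone plain C++ sketch of the XOR butterfly reduction these kernels perform across QK_WARP_SIZE lanes; this is not the SYCL/dpct code itself, and QK_WARP_SIZE = 32 is assumed purely for illustration:

    #include <cstdio>

    int main() {
        const int QK_WARP_SIZE = 32;   // assumed value, for illustration only
        float lane[QK_WARP_SIZE];
        for (int i = 0; i < QK_WARP_SIZE; ++i) {
            lane[i] = (float) i;       // each "lane" holds a partial dot-product sum
        }
        // emulate dpct::permute_sub_group_by_xor: lane i reads lane (i ^ mask) and accumulates
        for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
            float next[QK_WARP_SIZE];
            for (int i = 0; i < QK_WARP_SIZE; ++i) {
                next[i] = lane[i] + lane[i ^ mask];
            }
            for (int i = 0; i < QK_WARP_SIZE; ++i) {
                lane[i] = next[i];
            }
        }
        printf("reduced sum on every lane: %.1f\n", lane[0]);   // 0+1+...+31 = 496.0
        return 0;
    }

The point mirrored by the hunks above is that the reduction loop bound and the [[intel::reqd_sub_group_size(...)]] attribute now come from the same QK_WARP_SIZE constant, so the sub-group is never narrower than the reduction expects.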


@@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
if (device->fp16) {
device_extensions.push_back("VK_KHR_shader_float16_int8");
}
device->name = device->properties.deviceName.data();
device->name = GGML_VK_NAME + std::to_string(idx);
device_create_info = {
vk::DeviceCreateFlags(),
@@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->buffer_type = {
/* .iface = */ ggml_backend_vk_buffer_type_interface,
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
};
@@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .device = */ nullptr,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
/* .context = */ nullptr,
};
@@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
UNUSED(backend);
}
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_vk_guid() {
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
return &guid;
}
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
ggml_vk_init(ctx, dev_num);
ggml_backend_t vk_backend = new ggml_backend {
/* .guid = */ ggml_backend_vk_guid(),
/* .interface = */ ggml_backend_vk_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
/* .context = */ ctx,
};
return vk_backend;
}
bool ggml_backend_is_vk(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
}
int ggml_backend_vk_get_device_count() {
return ggml_vk_get_device_count();
}
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
int dev_idx = vk_instance.device_indices[device];
ggml_vk_get_device_description(dev_idx, description, description_size);
}
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
*free = heap.size;
break;
}
}
}
//////////////////////////
struct ggml_backend_vk_device_context {
int device;
std::string name;
std::string description;
};
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ctx->name.c_str();
}
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ctx->description.c_str();
}
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
ggml_backend_vk_get_device_memory(ctx->device, free, total);
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ggml_backend_vk_buffer_type(ctx->device);
}
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
UNUSED(dev);
return ggml_backend_vk_host_buffer_type();
}
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
UNUSED(dev);
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
}
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->name = ggml_backend_vk_device_get_name(dev);
props->description = ggml_backend_vk_device_get_description(dev);
props->type = ggml_backend_vk_device_get_type(dev);
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* async */ false,
/* host_buffer */ true,
/* events */ false,
};
}
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
UNUSED(params);
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
return ggml_backend_vk_init(ctx->device);
}
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
@@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
return false;
}
UNUSED(backend);
UNUSED(dev);
}
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
return false;
}
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
return buft_ctx->device->idx == ctx->device;
}
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32;
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
UNUSED(backend);
UNUSED(dev);
}
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
return false;
}
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
return buft_ctx->device == ctx->device;
}
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op,
/* .supports_buft = */ ggml_backend_vk_supports_buft,
/* .offload_op = */ ggml_backend_vk_offload_op,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
/* .get_name = */ ggml_backend_vk_device_get_name,
/* .get_description = */ ggml_backend_vk_device_get_description,
/* .get_memory = */ ggml_backend_vk_device_get_memory,
/* .get_type = */ ggml_backend_vk_device_get_type,
/* .get_props = */ ggml_backend_vk_device_get_props,
/* .init_backend = */ ggml_backend_vk_device_init,
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_vk_device_supports_op,
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
/* .offload_op = */ ggml_backend_vk_device_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
static ggml_guid_t ggml_backend_vk_guid() {
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
return &guid;
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
UNUSED(reg);
return GGML_VK_NAME;
}
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
ggml_vk_init(ctx, dev_num);
ggml_backend_t vk_backend = new ggml_backend {
/* .guid = */ ggml_backend_vk_guid(),
/* .interface = */ ggml_backend_vk_interface,
/* .device = */ nullptr,
/* .context = */ ctx,
};
return vk_backend;
static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
UNUSED(reg);
return ggml_backend_vk_get_device_count();
}
bool ggml_backend_is_vk(ggml_backend_t backend) {
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
}
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
static std::vector<ggml_backend_dev_t> devices;
int ggml_backend_vk_get_device_count() {
return ggml_vk_get_device_count();
}
static bool initialized = false;
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
ggml_vk_get_device_description(device, description, description_size);
}
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
*total = heap.size;
*free = heap.size;
break;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
ctx->device = i;
ctx->name = GGML_VK_NAME + std::to_string(i);
ctx->description = desc;
devices.push_back(new ggml_backend_device {
/* .iface = */ ggml_backend_vk_device_i,
/* .reg = */ reg,
/* .context = */ ctx,
});
}
initialized = true;
}
}
GGML_ASSERT(device < devices.size());
return devices[device];
}
static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
/* .get_name = */ ggml_backend_vk_reg_get_name,
/* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
/* .get_device = */ ggml_backend_vk_reg_get_device,
/* .get_proc_address = */ NULL,
};
ggml_backend_reg_t ggml_backend_vk_reg() {
static ggml_backend_reg reg = {
/* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr,
};
return &reg;
}
// Extension availability
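A minimal usage sketch of the new registration path, using only entry points that appear in this compare (ggml_backend_vk_reg, ggml_backend_vk_get_device_count, ggml_backend_reg_dev_get, and the generic ggml_backend_dev_* getters used later in the llama.cpp hunks); treat it as an illustration, not the canonical API surface:

    #include "ggml-backend.h"
    #include "ggml-vulkan.h"
    #include <cstdio>

    int main() {
        ggml_backend_reg_t reg = ggml_backend_vk_reg();
        const int n_dev = ggml_backend_vk_get_device_count();
        for (int i = 0; i < n_dev; ++i) {
            // devices are created lazily by the registry and owned by it
            ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, (size_t) i);
            size_t free = 0, total = 0;
            ggml_backend_dev_memory(dev, &free, &total);
            printf("%s (%s): %zu MiB free of %zu MiB\n",
                   ggml_backend_dev_name(dev),
                   ggml_backend_dev_description(dev),
                   free / 1024 / 1024, total / 1024 / 1024);
        }
        return 0;
    }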


@@ -35,10 +35,6 @@
#include <omp.h>
#endif
#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE
#endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
#endif
#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
#include <TargetConditionals.h>
#endif
@@ -326,8 +324,9 @@ struct ggml_logger_state {
static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
if (format == NULL)
if (format == NULL) {
return;
}
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
@@ -386,22 +385,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
//#define GGML_SOFT_MAX_ACCELERATE
#endif
void * ggml_aligned_malloc(size_t size) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
return _aligned_malloc(size, TENSOR_ALIGNMENT);
#else
inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL;
#ifdef GGML_USE_CPU_HBM
int result = hbw_posix_memalign(&aligned_memory, 16, size);
int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
#elif TARGET_OS_OSX
kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
int result = EFAULT;
switch (alloc_status) {
case KERN_SUCCESS:
result = 0;
break;
case KERN_INVALID_ADDRESS:
result = EINVAL;
break;
case KERN_NO_SPACE:
result = ENOMEM;
break;
default:
result = EFAULT;
break;
}
#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
const long page_size = sysconf(_SC_PAGESIZE);
int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
#endif
if (result != 0) {
// Handle allocation failure
@@ -419,14 +436,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
return NULL;
}
return aligned_memory;
#endif
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#ifdef GGML_USE_CPU_HBM
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
void ggml_aligned_free(void * ptr, size_t size) {
GGML_UNUSED(size);
#if defined(_MSC_VER) || defined(__MINGW32__)
_aligned_free(ptr);
#elif GGML_USE_CPU_HBM
if (ptr != NULL) {
hbw_free(ptr);
}
#elif TARGET_OS_OSX
if (ptr != NULL) {
vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
}
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
free(ptr);
#endif
}
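The pair above replaces the old GGML_ALIGNED_MALLOC/GGML_ALIGNED_FREE macros with functions, and the free side now takes the allocation size because the new macOS vm_allocate/vm_deallocate path needs the length to unmap. A stand-alone POSIX-only model of that shape, not ggml's implementation:

    #include <cstdio>
    #include <cstdlib>

    static const size_t DEMO_ALIGNMENT = 32;   // stand-in for TENSOR_ALIGNMENT, value assumed

    static void * aligned_malloc_demo(size_t size) {
        void * ptr = NULL;
        if (size == 0 || posix_memalign(&ptr, DEMO_ALIGNMENT, size) != 0) {
            return NULL;
        }
        return ptr;
    }

    // size is unused on this path, but the signature mirrors ggml_aligned_free(ptr, size)
    // so that back-ends which need the length (vm_deallocate) can be serviced uniformly
    static void aligned_free_demo(void * ptr, size_t size) {
        (void) size;
        free(ptr);
    }

    int main() {
        void * buf = aligned_malloc_demo(1 << 20);
        printf("allocated %p\n", buf);
        aligned_free_demo(buf, 1 << 20);
        return 0;
    }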
inline static void * ggml_malloc(size_t size) {
if (size == 0) {
@@ -3869,7 +3898,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.no_alloc =*/ params.no_alloc,
/*.no_alloc_save =*/ params.no_alloc,
@@ -3909,7 +3938,7 @@ void ggml_free(struct ggml_context * ctx) {
__func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
}
found = true;
@@ -15695,6 +15724,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot;
ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
GGML_ASSERT(v_to_float && "fattn: unsupported V-type");
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
@@ -19608,9 +19640,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
if (!threadpool) return;
const int n_threads = threadpool->n_threads_max;
#ifndef GGML_USE_OPENMP
struct ggml_compute_state* workers = threadpool->workers;
const int n_threads = threadpool->n_threads_max;
ggml_mutex_lock(&threadpool->mutex);
@@ -19630,8 +19663,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
ggml_cond_destroy(&threadpool->cond);
#endif // GGML_USE_OPENMP
GGML_ALIGNED_FREE(threadpool->workers);
GGML_ALIGNED_FREE(threadpool);
const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
ggml_aligned_free(threadpool->workers, workers_size);
ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
}
#ifndef GGML_USE_OPENMP
@@ -20063,7 +20097,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_cplan * cplan) {
struct ggml_threadpool * threadpool =
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
ggml_aligned_malloc(sizeof(struct ggml_threadpool));
{
threadpool->cgraph = cgraph;
threadpool->cplan = cplan;
@@ -20084,7 +20118,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
// Allocate and init workers state
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
memset(workers, 0, workers_size);
for (int j = 0; j < tpp->n_threads; j++) {
@@ -23222,6 +23256,14 @@ int ggml_cpu_has_avx512_bf16(void) {
#endif
}
int ggml_cpu_has_amx_int8(void) {
#if defined(__AMX_INT8__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
return 1;


@@ -217,6 +217,7 @@ extern "C" {
typedef struct llama_token_data_array {
// TODO: consider SoA
// NOTE: this pointer can be modified by the samplers
llama_token_data * data;
size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -232,8 +233,11 @@ extern "C" {
// - token : the token ids of the input (used when embd is NULL)
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// (if set to NULL, the token position will be tracked automatically by llama_decode)
// - seq_id : the sequence to which the respective token belongs
// (if set to NULL, the sequence ID will be assumed to be 0)
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
// (if set to NULL, only the logits for the last token will be returned)

//
typedef struct llama_batch {
int32_t n_tokens;
@@ -244,15 +248,6 @@ extern "C" {
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below
//
// pos[i] = all_pos_0 + i*all_pos_1
//
llama_pos all_pos_0; // used if pos == NULL
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch;
enum llama_model_kv_override_type {
@@ -776,15 +771,15 @@ extern "C" {
// Decoding
//
// Return batch for single sequence of tokens starting at pos_0
// Return batch for single sequence of tokens
// The sequence ID will be fixed to 0
// The position of the tokens will be tracked automatically by llama_decode
//
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
//
LLAMA_API struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
llama_pos pos_0,
llama_seq_id seq_id);
int32_t n_tokens);
// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
// Each token can be assigned up to n_seq_max sequence ids
@@ -953,12 +948,6 @@ extern "C" {
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
LLAMA_API bool llama_token_is_prefix(
const struct llama_model * model,
llama_token token0,
llama_token token1);
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
/// @param text The char pointer must be large enough to hold the resulting text.
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
@@ -1081,12 +1070,13 @@ extern "C" {
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on the logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1102,6 +1092,8 @@ extern "C" {
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
/// @details Updates the logits l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
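A hedged before/after sketch of the simplified batch helper; the context and token vector are assumed to come from the usual llama_new_context_with_model and llama_tokenize setup, which is omitted here:

    #include "llama.h"
    #include <vector>

    static int32_t decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
        // old API: llama_batch_get_one(tokens.data(), tokens.size(), /*pos_0=*/0, /*seq_id=*/0)
        llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
        // positions are now tracked automatically and the sequence id defaults to 0
        return llama_decode(ctx, batch);
    }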


@@ -15,7 +15,7 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
"low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
"model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
@@ -25,12 +25,12 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
]
CLI_ARGS_LLAMA_BENCH = [
"batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
"batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
"n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
]
CLI_ARGS_LLAMA_SERVER = [
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base",
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
"threads", "verbose"


@@ -63,6 +63,30 @@ static void llama_log_softmax(float * array, size_t size) {
}
*/
static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
if (temp <= 0.0f) {
// find the token with the highest logit and set the rest to -inf
size_t max_i = 0;
float max_l = cur_p->data[0].logit;
for (size_t i = 1; i < cur_p->size; ++i) {
if (cur_p->data[i ].logit > max_l) {
cur_p->data[max_i].logit = -INFINITY;
max_i = i;
max_l = cur_p->data[i].logit;
} else {
cur_p->data[i].logit = -INFINITY;
}
}
return;
}
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= temp;
}
}
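llama_sampler_temp_impl is new: for temp <= 0 it keeps only the highest logit and sets the rest to -inf, so later sampling behaves greedily. A stand-alone model of that rule over a plain vector of logits (illustrative, not the llama.cpp data structures):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    static void temp_apply_demo(std::vector<float> & logits, float temp) {
        if (temp <= 0.0f) {
            size_t max_i = 0;
            for (size_t i = 1; i < logits.size(); ++i) {
                if (logits[i] > logits[max_i]) {
                    logits[max_i] = -INFINITY;   // the previous maximum is knocked out
                    max_i = i;
                } else {
                    logits[i] = -INFINITY;
                }
            }
            return;
        }
        for (float & l : logits) {
            l /= temp;                           // the usual temperature scaling
        }
    }

    int main() {
        std::vector<float> logits = {1.0f, 3.5f, 2.0f};
        temp_apply_demo(logits, 0.0f);
        printf("%f %f %f\n", logits[0], logits[1], logits[2]);   // -inf 3.500000 -inf
        return 0;
    }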
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
GGML_ASSERT(cur_p->size > 0);
@@ -427,6 +451,9 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
}
@@ -912,9 +939,8 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*
static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_temp *) smpl->ctx;
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= ctx->temp;
}
llama_sampler_temp_impl(cur_p, ctx->temp);
}
static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
@@ -961,6 +987,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
if (ctx->delta > 0) {
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
const float max_temp = ctx->temp + ctx->delta;
float exponent_val = ctx->exponent;
// no need to do anything if there is only one (or zero) candidates
@@ -998,9 +1025,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
#endif
// Apply the dynamically calculated temperature scaling
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= dyn_temp;
}
llama_sampler_temp_impl(cur_p, dyn_temp);
// Re-compute softmax probabilities after scaling logits with dynamic temperature
const double max_l_double = cur_p->data[0].logit;
@@ -1024,9 +1049,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
}
#endif
} else {
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= ctx->temp;
}
llama_sampler_temp_impl(cur_p, ctx->temp);
}
}
@@ -1745,6 +1768,9 @@ struct llama_sampler * llama_sampler_init_logit_bias(
struct llama_sampler_infill {
const struct llama_vocab * vocab;
std::vector<char> buf0;
std::vector<char> buf1;
};
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
@@ -1810,27 +1836,44 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
size_t n_combined = 0; GGML_UNUSED(n_combined);
// combine tokens with common prefix
for (size_t i = 0; i < cur_p->size; ++i) {
for (size_t j = 0; j < cur_p->size; ++j) {
if (cur_p->data[i].logit == -INFINITY) {
for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
if (cur_p->data[i0].logit == -INFINITY) {
break;
}
if (i == j || cur_p->data[j].logit == -INFINITY) {
if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
continue;
}
if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) {
if (cur_p->data[i].p > cur_p->data[j].p) {
cur_p->data[i].p += cur_p->data[j].p;
cur_p->data[j].logit = -INFINITY;
cur_p->data[j].p = 0.0f;
} else {
cur_p->data[j].p += cur_p->data[i].p;
cur_p->data[i].logit = -INFINITY;
cur_p->data[i].p = 0.0f;
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(len0);
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(len1);
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
// token i0 is a prefix of token i1
if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
int dst = i0;
int src = i1;
// merge into the token with higher probability
if (cur_p->data[i1].p > cur_p->data[i0].p) {
std::swap(dst, src);
}
cur_p->data[dst].p += cur_p->data[src].p;
cur_p->data[src].logit = -INFINITY;
cur_p->data[src].p = 0.0f;
n_combined++;
}
}
@@ -1936,6 +1979,8 @@ struct llama_sampler * llama_sampler_init_infill_impl(
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
/* .vocab = */ &vocab,
/* .buf0 = */ std::vector<char>(512),
/* .buf1 = */ std::vector<char>(512),
},
};
}
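The rewritten merge step detokenizes both candidates into buf0/buf1 and compares the pieces directly, replacing the removed llama_token_is_prefix helper. A stand-alone model of the merge rule over plain strings (illustrative only; the real code works on llama_token_data entries):

    #include <cmath>
    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    struct cand { std::string text; float p; float logit; };

    // when one candidate's text is a prefix of another's, fold the probability mass
    // into the more likely candidate and knock the other out with a -inf logit
    static void merge_prefixes_demo(std::vector<cand> & cands) {
        for (size_t i0 = 0; i0 < cands.size(); ++i0) {
            for (size_t i1 = 0; i1 < cands.size(); ++i1) {
                if (i0 == i1 || cands[i0].logit == -INFINITY || cands[i1].logit == -INFINITY) {
                    continue;
                }
                const std::string & a = cands[i0].text;
                const std::string & b = cands[i1].text;
                if (!a.empty() && a.size() <= b.size() && b.compare(0, a.size(), a) == 0) {
                    size_t dst = i0, src = i1;
                    if (cands[i1].p > cands[i0].p) {
                        std::swap(dst, src);
                    }
                    cands[dst].p    += cands[src].p;
                    cands[src].logit = -INFINITY;
                    cands[src].p     = 0.0f;
                }
            }
        }
    }

    int main() {
        std::vector<cand> cands = { {"Hel", 0.2f, 1.0f}, {"Hello", 0.5f, 2.0f}, {"World", 0.3f, 1.5f} };
        merge_prefixes_demo(cands);
        for (const cand & c : cands) {
            printf("%-5s p=%.2f logit=%.2f\n", c.text.c_str(), c.p, c.logit);
        }
        return 0;
    }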


@@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
}
// seed the work queue with all possible 2-character tokens.
for (size_t i = 1; i < symbols.size(); ++i) {
for (int i = 1; i < (int) symbols.size(); ++i) {
try_add_bigram(i - 1, i);
}
@@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
index++;
symbols.emplace_back(sym);
}
for (size_t i = 1; i < symbols.size(); ++i) {
for (int i = 1; i < (int) symbols.size(); ++i) {
add_new_bigram(i - 1, i);
}
@@ -1858,23 +1858,6 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
return 0;
}
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1) {
char text_buf_0[128];
char text_buf_1[128];
const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false);
const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false);
if (len0 <= 0 || len1 <= 0) {
return false;
}
return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0;
}
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,


@@ -8,14 +8,16 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#if defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h"
#elif defined(GGML_USE_CANN)
# include "ggml-cann.h"
#endif
#ifndef __AMX_INT8__
#undef GGML_USE_AMX
#endif
#ifdef GGML_USE_AMX
# include "ggml-amx.h"
#endif
// TODO: replace with ggml API call
@@ -2945,9 +2947,6 @@ struct llama_sbatch_seq {
llama_seq_id * seq_id;
size_t offset;
size_t length;
// helper for smoother batch API transition -- can be deprecated in the future
llama_seq_id all_seq_id; // used if seq_id == NULL
};
// sequence-length-aware batch splitting
@@ -3042,30 +3041,18 @@ struct llama_sbatch {
} else {
ubatch.embd = nullptr;
}
// from here on, the else branches are deprecated;
// they are helpers for smoother batch API transition
if (batch->pos) {
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
}
} else {
// simple split
ubatch.pos = batch->pos + seq.offset;
if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
}
} else {
for (size_t i = 0; i < length; ++i) {
llama_pos bi = ids[seq.offset + i];
ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
}
// simple split
ubatch.pos = batch->pos + seq.offset;
}
if (ubatch.equal_seqs) {
ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
if (seq.seq_id) {
ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
} else {
GGML_ASSERT(seq.n_seq_id == 1);
ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
}
} else {
// simple split
@@ -3078,10 +3065,6 @@ struct llama_sbatch {
}
if (batch->seq_id) {
ubatch.seq_id = batch->seq_id + seq.offset;
} else {
for (size_t i = 0; i < length; ++i) {
ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
}
}
}
if (logits_all) {
@@ -3200,7 +3183,6 @@ struct llama_sbatch {
s.seq_id = nullptr;
s.offset = 0;
s.length = n_tokens;
s.all_seq_id = batch.all_seq_id;
return;
}
std::sort(ids.begin(), ids.end(),
@@ -3223,7 +3205,7 @@ struct llama_sbatch {
if (batch.pos) {
return batch.pos[a] < batch.pos[b];
}
// no pos, sort by id (assuming batch.all_pos_1 is positive)
// no pos, sort by id
return a < b;
}
// shared prompts go first
@@ -3233,30 +3215,25 @@ struct llama_sbatch {
// init seq
llama_sbatch_seq * last_seq = nullptr;
if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
for (size_t i = 0; i < n_tokens; ++i) {
const size_t bi = ids[i];
const int32_t n_seqs = batch.n_seq_id[bi];
llama_seq_id * seq_ids = batch.seq_id[bi];
if (last_seq != nullptr) {
bool same = n_seqs == last_seq->n_seq_id;
for (int32_t j = 0; same && j < n_seqs; ++j) {
if (seq_ids[j] != last_seq->seq_id[j]) {
same = false;
}
}
if (same) {
last_seq->length += 1;
continue;
for (size_t i = 0; i < n_tokens; ++i) {
const size_t bi = ids[i];
const int32_t n_seqs = batch.n_seq_id[bi];
llama_seq_id * seq_ids = batch.seq_id[bi];
if (last_seq != nullptr) {
bool same = n_seqs == last_seq->n_seq_id;
for (int32_t j = 0; same && j < n_seqs; ++j) {
if (seq_ids[j] != last_seq->seq_id[j]) {
same = false;
}
}
llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id};
seq.push_back(new_seq);
last_seq = &seq.back();
if (same) {
last_seq->length += 1;
continue;
}
}
} else {
llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
seq.push_back(new_seq);
last_seq = &seq.back();
}
// keep shared prompts first at the end, then sort by length descending.
std::sort(seq.begin(), seq.end(),
@@ -3416,12 +3393,8 @@ struct llama_lora_adapter {
static int llama_get_device_count(const llama_model & model) {
int count = (int) model.devices.size();
#if defined(GGML_USE_SYCL)
count += ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
count += ggml_backend_vk_get_device_count();
#elif defined(GGML_USE_CANN)
count += ggml_backend_cann_get_device_count();
#if defined(GGML_USE_RPC)
count += (int) model.rpc_servers.size();
#endif
return count;
@@ -3441,20 +3414,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
}
}
#if defined(GGML_USE_SYCL)
if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type();
}
#elif defined(GGML_USE_CANN)
if (host_buffer) {
buft = ggml_backend_cann_host_buffer_type();
}
#elif defined(GGML_USE_CPU_HBM)
#if defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
if (host_buffer) {
buft = ggml_backend_vk_host_buffer_type();
}
#endif
if (buft == nullptr) {
@@ -3473,14 +3434,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
}
device -= (int)model.devices.size();
#if defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(device);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(device);
#elif defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(device);
#elif defined(GGML_USE_CANN)
buft = ggml_backend_cann_buffer_type(device);
#endif
if (buft == nullptr) {
@@ -3507,12 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
}
}
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu);
}
@@ -3530,24 +3479,14 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
return free;
}
#if defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_CANN)
size_t total;
size_t free;
ggml_backend_cann_get_device_memory(device, &free, &total);
return free;
#else
if (model.devices.size() > 0) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(model.devices[0]);
LLAMA_LOG_WARN("%s: failed to get free memmory of device:%d of backend:%s, for device id is out of range.\n", __func__, device, ggml_backend_reg_name(reg));
} else {
LLAMA_LOG_WARN("%s: failed to get free memmory of device, no devices in inputted model.\n", __func__);
}
return 1;
#endif
GGML_UNUSED(model);
GGML_UNUSED(device);
}
@@ -6750,9 +6689,9 @@ static void llm_load_vocab(
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
} else {
// token is control, but not marked as EOG -> print a warning
// token is control, but not marked as EOG -> print a debug log
if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
__func__, t.second, t.first.c_str());
}
}
@@ -7046,7 +6985,14 @@ static bool llm_load_tensors(
// assign cpu layers
for (int i = 0; i < i_gpu_start; ++i) {
#ifdef GGML_USE_AMX
model.buft_layer[i] = {
ggml_backend_amx_buffer_type(),
llama_default_buffer_type_cpu(model, true)
};
#else
model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
#endif
}
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -16095,9 +16041,11 @@ struct llm_build_context {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_norm", -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
@@ -17147,10 +17095,10 @@ static void llama_graph_compute(
//
static int llama_decode_internal(
llama_context & lctx,
llama_batch batch_all) { // TODO: rename back to batch
llama_batch batch) {
lctx.is_encoding = false;
const uint32_t n_tokens_all = batch_all.n_tokens;
const uint32_t n_tokens_all = batch.n_tokens;
if (n_tokens_all == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -17161,12 +17109,12 @@ static int llama_decode_internal(
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (batch_all.token) {
if (batch.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
}
@@ -17197,9 +17145,9 @@ static int llama_decode_internal(
lctx.embd_seq.clear();
// count outputs
if (batch_all.logits && !embd_pooled) {
if (batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs += batch_all.logits[i] != 0;
n_outputs += batch.logits[i] != 0;
}
} else if (lctx.logits_all || embd_pooled) {
n_outputs = n_tokens_all;
@@ -17208,7 +17156,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
lctx.sbatch.from_batch(batch_all, n_embd,
lctx.sbatch.from_batch(batch, n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -19093,8 +19041,7 @@ bool llama_supports_mlock(void) {
}
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
@@ -19225,8 +19172,13 @@ struct llama_model * llama_load_model_from_file(
case GGML_BACKEND_DEVICE_TYPE_GPU:
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
{
size_t free, total; // NOLINT
ggml_backend_dev_memory(dev, &free, &total);
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
model->devices.push_back(dev);
break;
}
}
}
@@ -19278,7 +19230,7 @@ struct llama_context * llama_new_context_with_model(
params.flash_attn = false;
}
if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
@@ -19421,54 +19373,7 @@ struct llama_context * llama_new_context_with_model(
main_gpu -= (int)model->devices.size();
}
#if defined(GGML_USE_VULKAN)
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
llama_free(ctx);
return nullptr;
}
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_vk_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#elif defined(GGML_USE_SYCL)
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#elif defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) {
auto * backend = ggml_backend_kompute_init(main_gpu);
if (backend == nullptr) {
@@ -19478,30 +19383,6 @@ struct llama_context * llama_new_context_with_model(
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_CANN)
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
// TODO: ggml_backend_cann does not support split tensors yet, just leave the code here.
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
// TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_cann_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#endif
// add other backends (such as BLAS)
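With the Vulkan/SYCL/CANN-specific blocks removed, context creation can initialize one backend per device that was collected into model->devices during model load (see the earlier hunk). A minimal sketch of that registry-driven path, written as a fragment of llama_new_context_with_model and presented as an assumption about the replacement code rather than a quote of it:

// Hedged sketch: one backend per device gathered at model-load time,
// replacing the per-backend #ifdef blocks deleted above.
for (auto * dev : model->devices) {
    ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
    if (backend == nullptr) {
        LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
        llama_free(ctx);
        return nullptr;
    }
    ctx->backends.push_back(backend);
}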
@@ -21153,9 +21034,7 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
llama_pos pos_0,
llama_seq_id seq_id) {
int32_t n_tokens) {
return {
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
@@ -21164,9 +21043,6 @@ struct llama_batch llama_batch_get_one(
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
/*all_pos_0 =*/ pos_0,
/*all_pos_1 =*/ 1,
/*all_seq_id =*/ seq_id,
};
}
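With pos_0 and seq_id dropped from the signature, callers pass only the token pointer and count. A minimal usage sketch, assuming ctx is a valid llama_context and prompt_tokens was produced by llama_tokenize beforehand (the helper decode_prompt is illustrative):

// Hedged usage sketch of the simplified llama_batch_get_one signature.
#include "llama.h"
#include <cstdio>
#include <vector>

static int decode_prompt(llama_context * ctx, std::vector<llama_token> & prompt_tokens) {
    // positions and sequence ids are no longer supplied here; llama_decode
    // fills them in via llama_batch_allocr (see the hunk further down)
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), (int32_t) prompt_tokens.size());
    const int ret = llama_decode(ctx, batch);
    if (ret != 0) {
        fprintf(stderr, "llama_decode failed, ret = %d\n", ret);
    }
    return ret;
}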
@@ -21179,9 +21055,6 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
/*all_pos_0 =*/ 0,
/*all_pos_1 =*/ 0,
/*all_seq_id =*/ 0,
};
if (embd) {
@@ -21217,11 +21090,62 @@ void llama_batch_free(struct llama_batch batch) {
if (batch.logits) free(batch.logits);
}
// temporarily allocate memory for the input batch if needed
static const llama_seq_id batch_default_seq_id = 0;
struct llama_batch_allocr {
std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id;
std::vector<int8_t> logits;
struct llama_batch batch;
// optionally fill in the fields missing from a batch returned by llama_batch_get_one
llama_batch_allocr(struct llama_context * ctx, struct llama_batch in_batch) {
batch = in_batch;
if (!batch.pos) {
// determine the last position in KV cache
llama_pos last_pos = -1;
for (const auto & cell : ctx->kv_self.cells) {
if (cell.has_seq_id(batch_default_seq_id)) {
last_pos = std::max(last_pos, cell.pos);
}
}
last_pos++; // next position
pos.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; i++) {
pos[i] = i+last_pos;
}
batch.pos = pos.data();
}
if (!batch.n_seq_id) {
n_seq_id.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; i++) {
n_seq_id[i] = seq_id_0.size();
}
batch.n_seq_id = n_seq_id.data();
}
if (!batch.seq_id) {
seq_id.resize(batch.n_tokens + 1);
seq_id[batch.n_tokens] = NULL;
for (int32_t i = 0; i < batch.n_tokens; i++) {
seq_id[i] = seq_id_0.data();
}
batch.seq_id = seq_id.data();
}
if (!batch.logits) {
logits.resize(batch.n_tokens);
logits[logits.size() - 1] = true;
batch.logits = logits.data();
}
}
};
int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_encode_internal(*ctx, batch);
if (ret < 0) {
llama_batch_allocr batch_allocr(ctx, batch);
const int ret = llama_encode_internal(*ctx, batch_allocr.batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
}
@@ -21231,8 +21155,9 @@ int32_t llama_encode(
int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_decode_internal(*ctx, batch);
if (ret < 0) {
llama_batch_allocr batch_allocr(ctx, batch);
const int ret = llama_decode_internal(*ctx, batch_allocr.batch);
if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}
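Because both llama_encode and llama_decode now route the input through llama_batch_allocr, a batch that only sets tokens (e.g. one returned by llama_batch_get_one) is completed automatically. A hedged illustration of the effective defaults for a 3-token batch against an empty KV cache, derived from the allocator code shown above:

// What llama_batch_allocr fills in for llama_batch_get_one(tokens, 3)
// when the KV cache is empty (illustrative values, not a code quote):
//   batch.pos      -> { 0, 1, 2 }   // continues from the last position cached for seq 0
//   batch.n_seq_id -> { 1, 1, 1 }   // one sequence id per token
//   batch.seq_id   -> each entry points at { 0 } (batch_default_seq_id)
//   batch.logits   -> { 0, 0, 1 }   // only the last token requests logits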
@@ -21500,13 +21425,6 @@ int32_t llama_token_to_piece(
return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
}
bool llama_token_is_prefix(
const struct llama_model * model,
llama_token token0,
llama_token token1) {
return llama_token_is_prefix_impl(model->vocab, token0, token1);
}
int32_t llama_detokenize(
const struct llama_model * model,
const llama_token * tokens,
@@ -21880,6 +21798,7 @@ const char * llama_print_system_info(void) {
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";

View File

@@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^abc?d*efg+(hij)?kl$"
})""",
R"""(
root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space
root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
@@ -709,7 +709,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
})""",
R"""(
root ::= "\"" "[]{}()|+*?" "\"" space
root ::= "\"" ("[]{}()|+*?") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
@@ -722,7 +722,20 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
"pattern": "^\"$"
})""",
R"""(
root ::= "\"" "\"" "\"" space
root ::= "\"" ("\"") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
test({
SUCCESS,
"regexp with top-level alternation",
R"""({
"type": "string",
"pattern": "^A|B|C|D$"
})""",
R"""(
root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
space ::= | " " | "\n" [ \t]{0,20}
)"""
});
@@ -736,7 +749,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
})""",
R"""(
dot ::= [^\x0A\x0D]
root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space
root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
root-1 ::= [0-9]
space ::= | " " | "\n" [ \t]{0,20}
)"""

View File

@@ -18,203 +18,176 @@ static void dump(const llama_token_data_array * cur_p) {
#define DUMP(__cur_p) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__cur_p)); printf("-\n"); } while(0)
#define APPLY(__cnstr, __cur_p) do { \
auto * cnstr = (__cnstr); \
llama_sampler_apply(cnstr, (__cur_p)); \
llama_sampler_free(cnstr); \
} while(0)
struct sampler_tester {
sampler_tester(size_t n_vocab) {
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(token_id);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
const size_t n_vocab = probs.size();
cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
}
sampler_tester(const std::vector<float> & probs, const std::vector<float> & probs_expected) : probs_expected(probs_expected) {
cur.reserve(probs.size());
for (llama_token token_id = 0; token_id < (llama_token)probs.size(); token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, probs[token_id]});
}
cur_p = llama_token_data_array { cur.data(), cur.size(), -1, false };
}
void apply(llama_sampler * sampler) {
llama_sampler_apply(sampler, &cur_p);
llama_sampler_free(sampler);
}
void check() {
GGML_ASSERT(cur_p.size == probs_expected.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - probs_expected[i]) < 1e-5);
}
}
llama_token_data_array cur_p;
private:
const std::vector<float> probs_expected;
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
};
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
APPLY(llama_sampler_init_top_k(k), &cur_p);
DUMP(&cur_p);
static void test_temp(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp) {
sampler_tester tester(probs, probs_expected);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
}
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_temp(temp));
tester.apply(llama_sampler_init_dist(0));
DUMP(&tester.cur_p);
tester.check();
}
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
static void test_temp_ext(const std::vector<float> & probs, const std::vector<float> & probs_expected, float temp, float delta, float exponent) {
sampler_tester tester(probs, probs_expected);
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_temp_ext(temp, delta, exponent));
tester.apply(llama_sampler_init_dist (0));
DUMP(&tester.cur_p);
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
APPLY(llama_sampler_init_top_p(p, 1), &cur_p);
DUMP(&cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
}
tester.check();
}
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
const size_t n_vocab = probs.size();
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & probs_expected, int k) {
sampler_tester tester(probs, probs_expected);
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_top_k(k));
tester.apply(llama_sampler_init_dist (0));
DUMP(&tester.cur_p);
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
DUMP(&cur_p);
APPLY(llama_sampler_init_tail_free(z, 1), &cur_p);
DUMP(&cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
}
tester.check();
}
static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
sampler_tester tester(probs, probs_expected);
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_top_p(p, 1));
tester.apply(llama_sampler_init_dist (0));
DUMP(&tester.cur_p);
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
DUMP(&cur_p);
APPLY(llama_sampler_init_min_p(p, 1), &cur_p);
DUMP(&cur_p);
APPLY(llama_sampler_init_softmax(), &cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
}
tester.check();
}
static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) {
const size_t n_vocab = probs.size();
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & probs_expected, float z) {
sampler_tester tester(probs, probs_expected);
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_tail_free(z, 1));
DUMP(&tester.cur_p);
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
DUMP(&cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
}
tester.check();
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
static void test_min_p(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
sampler_tester tester(probs, probs_expected);
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_min_p(p, 1));
tester.apply(llama_sampler_init_dist (0));
DUMP(&tester.cur_p);
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
DUMP(&cur_p);
APPLY(llama_sampler_init_typical(p, 1), &cur_p);
DUMP(&cur_p);
tester.check();
}
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
}
static void test_xtc(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p, float t) {
sampler_tester tester(probs, probs_expected);
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_xtc(p, t, 0, 0));
DUMP(&tester.cur_p);
tester.check();
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & probs_expected, float p) {
sampler_tester tester(probs, probs_expected);
DUMP(&tester.cur_p);
tester.apply(llama_sampler_init_typical(p, 1));
DUMP(&tester.cur_p);
tester.check();
}
static void test_penalties(
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
const std::vector<float> & probs_expected, float repeat_penalty, float alpha_frequency, float alpha_presence
) {
GGML_ASSERT(probs.size() == expected_probs.size());
GGML_ASSERT(probs.size() == probs_expected.size());
sampler_tester tester(probs, probs_expected);
const size_t n_vocab = probs.size();
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
for (size_t i = 0; i < last_tokens.size(); i++) {
llama_sampler_accept(sampler, last_tokens[i]);
}
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
APPLY(sampler, &cur_p);
APPLY(llama_sampler_init_softmax(), &cur_p);
DUMP(&cur_p);
DUMP(&tester.cur_p);
tester.apply(sampler);
tester.apply(llama_sampler_init_dist(0));
DUMP(&tester.cur_p);
GGML_ASSERT(cur_p.size == expected_probs.size());
for (size_t i = 0; i < cur_p.size; i++) {
GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-3);
}
tester.check();
}
static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p
) {
std::vector<llama_token_data> cur;
cur.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(token_id);
cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
sampler_tester tester(n_vocab);
llama_token min_token_id = 0;
const llama_token max_token_id = n_vocab-1;
for (auto s : samplers_sequence) {
switch (s){
case 'k': APPLY(llama_sampler_init_top_k(top_k), &cur_p); break;
case 'k': tester.apply(llama_sampler_init_top_k(top_k)); break;
case 'f': GGML_ABORT("tail_free test not implemented");
case 'y': GGML_ABORT("typical test not implemented");
case 'p': APPLY(llama_sampler_init_top_p(top_p, 1), &cur_p); break;
case 'm': APPLY(llama_sampler_init_min_p(min_p, 1), &cur_p); break;
case 'p': tester.apply(llama_sampler_init_top_p(top_p, 1)); break;
case 'm': tester.apply(llama_sampler_init_min_p(min_p, 1)); break;
case 't': GGML_ABORT("temperature test not implemented");
default : GGML_ABORT("Unknown sampler");
}
APPLY(llama_sampler_init_softmax(), &cur_p); // make sure tokens are sorted for tests
tester.apply(llama_sampler_init_dist(0));
auto & cur_p = tester.cur_p;
const int size = cur_p.size;
@@ -307,21 +280,26 @@ static void test_perf() {
BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
BENCH(llama_sampler_init_typical (0.5f, 1), data, 32);
BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32);
BENCH(llama_sampler_init_softmax (), data, 32);
}
int main(void) {
ggml_time_init();
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
test_temp({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f);
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f, 0.0f, 1.0f);
test_temp_ext({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f, 0.0f, 0.0f, 0.0f}, 0.0f, 0.0f, 1.0f);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 1);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 3);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {1.0f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f}, 0.7f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.44444f, 0.33333f, 0.22222f}, 0.8f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1.0f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);