mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-28 17:27:26 +03:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
afcda09d15 | ||
|
|
bbce619adb | ||
|
|
4f0e43da6f | ||
|
|
bb28c1fe24 | ||
|
|
ee7c30578a | ||
|
|
47c0eda9d4 | ||
|
|
5306f4b3b5 | ||
|
|
40d5358d3c | ||
|
|
b65bb4baae | ||
|
|
a1a69f777a | ||
|
|
52fb93a2bd | ||
|
|
c9021714e8 | ||
|
|
1d7ab2b947 |
5
.github/workflows/build-apple.yml
vendored
5
.github/workflows/build-apple.yml
vendored
@@ -59,6 +59,7 @@ jobs:
|
||||
cmake -B build -G Xcode \
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
@@ -89,6 +90,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -138,6 +140,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -163,6 +166,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_BUILD_COMMON=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
@@ -206,6 +210,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
11
.github/workflows/build-cmake-pkg.yml
vendored
11
.github/workflows/build-cmake-pkg.yml
vendored
@@ -19,9 +19,14 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
PREFIX="$(pwd)"/inst
|
||||
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -S . -B build \
|
||||
-DCMAKE_PREFIX_PATH="$PREFIX" \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build build --config Release
|
||||
cmake --install build --prefix "$PREFIX" --config Release
|
||||
|
||||
|
||||
1
.github/workflows/release.yml
vendored
1
.github/workflows/release.yml
vendored
@@ -1108,6 +1108,7 @@ jobs:
|
||||
-DGGML_METAL_USE_BF16=ON \
|
||||
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||
-DLLAMA_OPENSSL=OFF \
|
||||
-DLLAMA_BUILD_APP=OFF \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TOOLS=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- By very precise and concise when writing code, comments, explanations, etc.
|
||||
- Be very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
|
||||
@@ -16,7 +16,8 @@ Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
|
||||
- Ask the user to tell you what model was used and write it in place of [MODEL]
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
|
||||
@@ -108,7 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests"
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" OFF)
|
||||
option(LLAMA_BUILD_APP "llama: build the unified binary" ON)
|
||||
option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
|
||||
|
||||
|
||||
@@ -49,7 +49,6 @@
|
||||
/examples/parallel/ @ggerganov
|
||||
/examples/passkey/ @ggerganov
|
||||
/examples/retrieval/ @ggerganov
|
||||
/examples/save-load-state/ @ggerganov
|
||||
/examples/speculative-simple/ @ggerganov
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
|
||||
@@ -27,6 +27,7 @@ LLM inference in C/C++
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
|
||||
|
||||
----
|
||||
|
||||
@@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
|
||||
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
||||
| [WebGPU](docs/build.md#webgpu) | All |
|
||||
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
||||
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
|
||||
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
|
||||
|
||||
@@ -3,7 +3,16 @@ set(TARGET llama-app)
|
||||
add_executable(${TARGET} llama.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE llama-server-impl llama-cli-impl llama-completion-impl llama-bench-impl)
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
llama-server-impl
|
||||
llama-cli-impl
|
||||
llama-completion-impl
|
||||
llama-bench-impl
|
||||
llama-batched-bench-impl
|
||||
llama-fit-params-impl
|
||||
llama-quantize-impl
|
||||
llama-perplexity-impl
|
||||
)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// visible
|
||||
int llama_server(int argc, char ** argv);
|
||||
int llama_cli(int argc, char ** argv);
|
||||
|
||||
// hidden
|
||||
int llama_completion(int argc, char ** argv);
|
||||
int llama_bench(int argc, char ** argv);
|
||||
int llama_batched_bench(int argc, char ** argv);
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
static int help(int argc, char ** argv);
|
||||
static int version(int argc, char ** argv);
|
||||
|
||||
@@ -22,12 +29,16 @@ struct command {
|
||||
};
|
||||
|
||||
static const command cmds[] = {
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmarking tool", {}, true, llama_bench },
|
||||
{"version", "Show version", {}, true, version },
|
||||
{"help", "Show available commands", {}, true, help },
|
||||
{"serve", "HTTP API server", {"server"}, false, llama_server },
|
||||
{"cli", "Command-line interactive interface", {"client"}, false, llama_cli },
|
||||
{"completion", "Text completion", {"complete"}, true, llama_completion },
|
||||
{"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench },
|
||||
{"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench},
|
||||
{"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params },
|
||||
{"quantize", "Quantize a model", {}, true, llama_quantize },
|
||||
{"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity },
|
||||
{"version", "Show version", {}, true, version },
|
||||
{"help", "Show available commands", {}, true, help },
|
||||
};
|
||||
|
||||
static int version(int argc, char ** argv) {
|
||||
@@ -67,6 +78,14 @@ int main(int argc, char ** argv) {
|
||||
|
||||
for (const auto & cmd : cmds) {
|
||||
if (matches(arg, cmd)) {
|
||||
|
||||
// router spawns children through this same binary, it needs the
|
||||
// subcommand to relaunch as 'llama serve' and not bare options
|
||||
#ifdef _WIN32
|
||||
_putenv_s("LLAMA_APP_CMD", cmd.name);
|
||||
#else
|
||||
setenv("LLAMA_APP_CMD", cmd.name, 1);
|
||||
#endif
|
||||
return cmd.func(argc - 1, argv + 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
|
||||
TVOS_MIN_OS_VERSION=16.4
|
||||
|
||||
BUILD_SHARED_LIBS=OFF
|
||||
LLAMA_BUILD_APP=OFF
|
||||
LLAMA_BUILD_EXAMPLES=OFF
|
||||
LLAMA_BUILD_TOOLS=OFF
|
||||
LLAMA_BUILD_TESTS=OFF
|
||||
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
|
||||
-DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
|
||||
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||
-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
|
||||
-DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
|
||||
-DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
|
||||
-DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
|
||||
-DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
|
||||
|
||||
@@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b {
|
||||
|
||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
|
||||
@@ -1617,6 +1617,11 @@ class TextModel(ModelBase):
|
||||
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]
|
||||
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
|
||||
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
|
||||
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
|
||||
# k-mer's own id (llama.cpp strips it on detokenization)
|
||||
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
|
||||
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
|
||||
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
|
||||
|
||||
|
||||
@@ -735,7 +735,7 @@ ninja
|
||||
|
||||
To read documentation for how to build on Android, [click here](./android.md)
|
||||
|
||||
## WebGPU [In Progress]
|
||||
## WebGPU
|
||||
|
||||
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
|
||||
|
||||
|
||||
@@ -27,7 +27,6 @@ else()
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(retrieval)
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
|
||||
@@ -25,6 +25,7 @@ android {
|
||||
arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON"
|
||||
|
||||
arguments += "-DBUILD_SHARED_LIBS=ON"
|
||||
arguments += "-DLLAMA_BUILD_APP=OFF"
|
||||
arguments += "-DLLAMA_BUILD_COMMON=ON"
|
||||
arguments += "-DLLAMA_OPENSSL=OFF"
|
||||
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET llama-save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
@@ -1561,7 +1561,8 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
|
||||
return env == nullptr || std::atoi(env) != 0;
|
||||
}();
|
||||
|
||||
if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) {
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
|
||||
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
|
||||
|
||||
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
|
||||
|
||||
@@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
|
||||
|
||||
const int nth = std::min(1024, ne0);
|
||||
int nth = std::min(256, ne0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
|
||||
// when rows are small, we can batch them together in a single threadgroup
|
||||
int nrptg = 1;
|
||||
if (nth < 256) {
|
||||
nrptg = std::min((256 + nth - 1) / nth, ne1);
|
||||
if (nrptg * nth > 256) {
|
||||
nrptg = 256 / nth;
|
||||
}
|
||||
}
|
||||
|
||||
const int nw0 = (ne1 + nrptg - 1) / nrptg;
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);
|
||||
|
||||
return 1;
|
||||
}
|
||||
@@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
|
||||
nk0 = ne10/ggml_blck_size(op->type);
|
||||
}
|
||||
|
||||
int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
int nth = std::min<int>(nk0*ne11, 256);
|
||||
|
||||
// when rows are small, we can batch them together in a single threadgroup
|
||||
int nrptg = 1;
|
||||
@@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
|
||||
nrptg = (nth + nk0 - 1)/nk0;
|
||||
nth = nk0;
|
||||
|
||||
if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
if (nrptg*nth > 256) {
|
||||
nrptg--;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7486,7 +7486,11 @@ kernel void kernel_concat(
|
||||
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = tgpig.x;
|
||||
const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
|
||||
|
||||
if (i1 >= args.ne1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int o[4] = {0, 0, 0, 0};
|
||||
o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
|
||||
|
||||
@@ -499,6 +499,12 @@ static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax { GGM
|
||||
GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
|
||||
|
||||
// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder
|
||||
// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion.
|
||||
static constexpr std::initializer_list<ggml_op> snake_pattern { GGML_OP_MUL, GGML_OP_SIN,
|
||||
GGML_OP_SQR, GGML_OP_MUL,
|
||||
GGML_OP_ADD };
|
||||
|
||||
//node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ]
|
||||
//node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
|
||||
//node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ]
|
||||
@@ -846,6 +852,9 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_conv_transpose_1d_f32;
|
||||
vk_pipeline pipeline_snake_f32;
|
||||
vk_pipeline pipeline_snake_f16;
|
||||
vk_pipeline pipeline_snake_bf16;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv6_f32;
|
||||
vk_pipeline pipeline_rwkv_wkv7_f32;
|
||||
@@ -1475,6 +1484,11 @@ struct vk_op_conv_transpose_1d_push_constants {
|
||||
int32_t s0;
|
||||
};
|
||||
|
||||
// Push constants for the fused snake pipelines (snake_f32 / snake_f16 / snake_bf16).
// Two uint32_t fields named ne0 and ne1, in the same order as the push_constant
// block declared in snake.comp.
struct vk_op_snake_push_constants {
|
||||
// size of the activation along dim 0; the shader bounds i0 with it and uses it as the row stride
uint32_t ne0;
|
||||
// size of the activation along dim 1; the shader bounds i1 with it
uint32_t ne1;
|
||||
};
|
||||
|
||||
struct vk_op_pool2d_push_constants {
|
||||
uint32_t IW; uint32_t IH;
|
||||
uint32_t OW; uint32_t OH;
|
||||
@@ -4845,6 +4859,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
||||
@@ -12110,6 +12128,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
|
||||
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
|
||||
}
|
||||
|
||||
// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
|
||||
// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
|
||||
// dedicated kernel directly. The pattern is validated by
|
||||
// ggml_vk_can_fuse_snake before this call.
|
||||
// Parameters:
//   ctx      - backend context; supplies the per-type snake pipelines via ctx->device
//   subctx   - forwarded unchanged to ggml_vk_dispatch_pipeline
//   cgraph   - graph holding the 5-node chain
//   node_idx - index of the first node of the chain (the leading MUL)
// The SIN node at node_idx + 1 is not read here; only its neighbours are needed
// to recover x, a, inv_b and the destination.
static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
|
||||
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
|
||||
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
|
||||
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
|
||||
// last node of the chain; its buffer receives the fused result
ggml_tensor * add = cgraph->nodes[node_idx + 4];
|
||||
|
||||
// x carries the full activation shape, a is the broadcast operand
|
||||
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
|
||||
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
|
||||
|
||||
// mul1 reads sqr and inv_b in either operand order
|
||||
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
|
||||
|
||||
// one pipeline per supported activation type; any other type aborts
vk_pipeline pipeline = nullptr;
|
||||
switch (x->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break;
|
||||
case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break;
|
||||
default: GGML_ABORT("unsupported type");
|
||||
}
|
||||
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
||||
|
||||
vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x);
|
||||
vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a);
|
||||
vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b);
|
||||
vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add);
|
||||
|
||||
// tensor extents are narrowed to the uint32_t push-constant fields
vk_op_snake_push_constants pc{};
|
||||
pc.ne0 = static_cast<uint32_t>(x->ne[0]);
|
||||
pc.ne1 = static_cast<uint32_t>(x->ne[1]);
|
||||
|
||||
// one element per (i0, i1) pair; NOTE(review): presumably converted to workgroup
// counts inside ggml_vk_dispatch_pipeline - confirm against that helper
std::array<uint32_t, 3> elements = { pc.ne0, pc.ne1, 1 };
|
||||
// buffer order x, a, inv_b, dst lines up with bindings 0..3 (A, B, C, D) in snake.comp
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
||||
const int32_t k1 = dst->op_params[1];
|
||||
@@ -13318,7 +13375,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
||||
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
|
||||
if (ctx->num_additional_fused_ops) {
|
||||
ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx);
|
||||
} else {
|
||||
ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
|
||||
}
|
||||
|
||||
break;
|
||||
case GGML_OP_DIV:
|
||||
@@ -14691,6 +14752,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
|
||||
return true;
|
||||
}
|
||||
|
||||
// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add.
|
||||
// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that
|
||||
// the broadcast operands a and inv_b share a [1, C] layout.
|
||||
// Returns true only when the five nodes starting at node_idx can be replaced by
// a single snake dispatch. Every check below is a pure rejection test: the
// function reads the graph and never modifies it.
//   ctx      - unused (kept in the signature, silenced with GGML_UNUSED)
//   cgraph   - graph to inspect
//   node_idx - index of the candidate leading MUL node
static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
|
||||
GGML_UNUSED(ctx);
|
||||
// generic op-sequence check against snake_pattern (MUL, SIN, SQR, MUL, ADD)
if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
|
||||
const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1];
|
||||
const ggml_tensor * sqr = cgraph->nodes[node_idx + 2];
|
||||
const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx + 4];
|
||||
|
||||
// x is the mul0 operand whose shape equals mul0's; a is the other operand
const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
|
||||
const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
|
||||
|
||||
// the operand of mul1 / add that is not the previous chain node
const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
|
||||
const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
|
||||
|
||||
// the residual added at the end must be the very tensor that entered mul0
if (x_in_add != x) {
|
||||
return false;
|
||||
}
|
||||
// only the types that have a dedicated snake pipeline
if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) {
|
||||
return false;
|
||||
}
|
||||
// Shader bindings: data_a is A_TYPE so it follows x's precision, while
|
||||
// data_b and data_c are hardcoded float, so the broadcast operands must
|
||||
// be F32 regardless of x's type.
|
||||
if (a->type != GGML_TYPE_F32) return false;
|
||||
if (inv_b->type != GGML_TYPE_F32) return false;
|
||||
// Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline).
|
||||
if (mul0->type != x->type) return false;
|
||||
if (sin_node->type != x->type) return false;
|
||||
if (sqr->type != x->type) return false;
|
||||
if (mul1->type != x->type) return false;
|
||||
if (add->type != x->type) return false;
|
||||
// a and inv_b are indexed identically by the shader, so their shapes must agree
if (!ggml_are_same_shape(a, inv_b)) {
|
||||
return false;
|
||||
}
|
||||
// a holds a single value along dim 0 and one entry per index of x's dim 1
if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) {
|
||||
return false;
|
||||
}
|
||||
// Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b
|
||||
// must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader.
|
||||
if (x->ne[2] != 1 || x->ne[3] != 1) return false;
|
||||
if (add->ne[2] != 1 || add->ne[3] != 1) return false;
|
||||
if (a->ne[2] != 1 || a->ne[3] != 1) return false;
|
||||
if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false;
|
||||
// Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1],
|
||||
// so every operand must be contiguous.
|
||||
if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) ||
|
||||
!ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check whether the tensors overlap in memory.
|
||||
// Fusions can potentially overwrite src tensors in ways that are not prevented
|
||||
// by ggml-alloc. If the fusion src is being applied in a way that's elementwise
|
||||
@@ -14998,6 +15118,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||
op_srcs_fused_elementwise[0] = false;
|
||||
op_srcs_fused_elementwise[1] = false;
|
||||
op_srcs_fused_elementwise[2] = false;
|
||||
} else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) {
|
||||
ctx->num_additional_fused_ops = 4;
|
||||
fusion_string = "SNAKE";
|
||||
// elementwise=true: snake.comp is safe under exact aliasing because each
|
||||
// thread reads data_a[idx] into a register before writing data_d[idx]
|
||||
// with a data dependency on that register. The overlap check still
|
||||
// rejects partial overlaps (different base or size).
|
||||
std::fill_n(op_srcs_fused_elementwise, 5, true);
|
||||
} else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
|
||||
ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
|
||||
ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
|
||||
@@ -15288,6 +15416,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
if (keep_pattern(topk_moe_late_softmax)) {
|
||||
continue;
|
||||
}
|
||||
if (keep_pattern(snake_pattern)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// First, grab the next unused node.
|
||||
current_set.push_back(first_unused);
|
||||
@@ -15310,7 +15441,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
|
||||
if (match_pattern(topk_moe_early_softmax_norm, j) ||
|
||||
match_pattern(topk_moe_sigmoid_norm_bias, j) ||
|
||||
match_pattern(topk_moe_early_softmax, j) ||
|
||||
match_pattern(topk_moe_late_softmax, j)) {
|
||||
match_pattern(topk_moe_late_softmax, j) ||
|
||||
match_pattern(snake_pattern, j)) {
|
||||
continue;
|
||||
}
|
||||
bool ok = true;
|
||||
|
||||
49
ggml/src/ggml-vulkan/vulkan-shaders/snake.comp
Normal file
49
ggml/src/ggml-vulkan/vulkan-shaders/snake.comp
Normal file
@@ -0,0 +1,49 @@
|
||||
#version 450
|
||||
|
||||
#include "types.glsl"
|
||||
|
||||
// Fused snake activation: y = x + sin(b * x)^2 * c
|
||||
// data_a [ne0, ne1] per element activation x (A_TYPE)
|
||||
// data_b [1, ne1] per channel multiplier (float)
|
||||
// data_c [1, ne1] per channel inverse scale (float, precomputed as 1 / freq)
|
||||
// data_d [ne0, ne1] output y (D_TYPE)
|
||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer B {float data_b[];};
|
||||
layout (binding = 2) readonly buffer C {float data_c[];};
|
||||
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint32_t ne0;
|
||||
uint32_t ne1;
|
||||
} p;
|
||||
|
||||
// Load A_TYPE to float
|
||||
// Reads element idx of data_a and returns it as a float.
// idx is a flat index into data_a; the caller computes it as i0 + i1 * ne0.
float load_val(uint32_t idx) {
|
||||
#if defined(DATA_A_BF16)
|
||||
// bf16 variant: A_TYPE is uint16_t, so the stored value goes through bf16_to_fp32
// (not defined in this file; presumably provided by types.glsl - confirm)
return bf16_to_fp32(uint32_t(data_a[idx]));
|
||||
#else
|
||||
// f32 / f16 variants: a plain conversion to float
return float(data_a[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Store float as D_TYPE
|
||||
// Writes v to element idx of data_d, converting from float to D_TYPE.
// idx is a flat index into data_d, the same one used for the matching load.
void store_val(uint32_t idx, float v) {
|
||||
#if defined(DATA_D_BF16)
|
||||
// bf16 variant: D_TYPE is uint16_t, so v goes through fp32_to_bf16
// (not defined in this file; presumably provided by types.glsl - confirm)
data_d[idx] = D_TYPE(fp32_to_bf16(v));
|
||||
#else
|
||||
// f32 / f16 variants: a plain conversion to D_TYPE
data_d[idx] = D_TYPE(v);
|
||||
#endif
|
||||
}
|
||||
|
||||
// One invocation per (i0, i1) element: computes x + sin(b * x)^2 * c, where
// b = data_b[i1] and c = data_c[i1] are shared by every i0 with the same i1.
void main() {
|
||||
const uint32_t i0 = gl_GlobalInvocationID.x;
|
||||
const uint32_t i1 = gl_GlobalInvocationID.y;
|
||||
// local_size_x is 256, so invocations past the end of a row must exit here
if (i0 >= p.ne0 || i1 >= p.ne1) return;
|
||||
|
||||
// flat index with ne0 as the row stride; the same index is used for input and output
const uint32_t idx = i0 + i1 * p.ne0;
|
||||
// the input is read into a local before the store below
const float xi = load_val(idx);
|
||||
const float s = sin(data_b[i1] * xi);
|
||||
// s * s is the squared sine
store_val(idx, xi + s * s * data_c[i1]);
|
||||
}
|
||||
@@ -952,6 +952,10 @@ void process_shaders() {
|
||||
|
||||
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
|
||||
string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||
string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
||||
string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
|
||||
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
|
||||
|
||||
@@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
|
||||
const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
|
||||
const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
|
||||
|
||||
var<workgroup> q_shmem: array<f32, Q_TILE * HEAD_DIM_QK>;
|
||||
var<workgroup> kv_shmem: array<f32, KV_TILE * KV_STAGE_STRIDE>;
|
||||
var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
|
||||
var<workgroup> q_shmem: array<Q_TYPE, Q_TILE * HEAD_DIM_QK>;
|
||||
var<workgroup> kv_shmem: array<KV_TYPE, KV_TILE * KV_STAGE_STRIDE>;
|
||||
var<workgroup> p_shmem: array<KV_TYPE, Q_TILE * KV_TILE>;
|
||||
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
@@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
|
||||
let head = f32(head_idx);
|
||||
let slope = select(1.0,
|
||||
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
|
||||
pow(params.m0, head + 1.0),
|
||||
head < params.n_head_log2),
|
||||
params.max_bias > 0.0);
|
||||
select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
|
||||
pow(params.m0, head + 1.0),
|
||||
head < params.n_head_log2),
|
||||
params.max_bias > 0.0);
|
||||
|
||||
for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
|
||||
let q_tile_row = elem_idx / HEAD_DIM_QK;
|
||||
@@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
|
||||
q_shmem[elem_idx] = select(
|
||||
0.0,
|
||||
f32(Q[global_q_row_offset + q_col]) * params.scale,
|
||||
Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale,
|
||||
head_q_row < params.seq_len_q);
|
||||
}
|
||||
|
||||
@@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
|
||||
let k4 = K[k_vec_index];
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
kv_shmem[kv_off + 0u] = f32(k4.x);
|
||||
kv_shmem[kv_off + 1u] = f32(k4.y);
|
||||
kv_shmem[kv_off + 2u] = f32(k4.z);
|
||||
kv_shmem[kv_off + 3u] = f32(k4.w);
|
||||
kv_shmem[kv_off + 0u] = KV_TYPE(k4.x);
|
||||
kv_shmem[kv_off + 1u] = KV_TYPE(k4.y);
|
||||
kv_shmem[kv_off + 2u] = KV_TYPE(k4.z);
|
||||
kv_shmem[kv_off + 3u] = KV_TYPE(k4.w);
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
@@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
var dot_val = 0.0;
|
||||
for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
|
||||
let q_off = q_base + chunk * 4u;
|
||||
let qv = vec4<f32>(
|
||||
let qv = vec4<Q_TYPE>(
|
||||
q_shmem[q_off + 0u],
|
||||
q_shmem[q_off + 1u],
|
||||
q_shmem[q_off + 2u],
|
||||
q_shmem[q_off + 3u]);
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
let kv = vec4<f32>(
|
||||
let kv = vec4<KV_TYPE>(
|
||||
kv_shmem[kv_off + 0u],
|
||||
kv_shmem[kv_off + 1u],
|
||||
kv_shmem[kv_off + 2u],
|
||||
kv_shmem[kv_off + 3u]);
|
||||
dot_val += dot(qv, kv);
|
||||
dot_val += dot(vec4<f32>(qv), vec4<f32>(kv));
|
||||
}
|
||||
#ifdef LOGIT_SOFTCAP
|
||||
dot_val = params.logit_softcap * tanh(dot_val);
|
||||
@@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let kv_local = sg_inv_id + slot * subgroup_size;
|
||||
if (row_active && kv_local < kv_count) {
|
||||
let p = exp(local_scores[slot] - new_max);
|
||||
p_shmem[subgroup_p_offset + kv_local] = p;
|
||||
p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p);
|
||||
local_sum += p;
|
||||
}
|
||||
}
|
||||
@@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
|
||||
let v4 = V[v_vec_index];
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
kv_shmem[kv_off + 0u] = f32(v4.x);
|
||||
kv_shmem[kv_off + 1u] = f32(v4.y);
|
||||
kv_shmem[kv_off + 2u] = f32(v4.z);
|
||||
kv_shmem[kv_off + 3u] = f32(v4.w);
|
||||
kv_shmem[kv_off + 0u] = KV_TYPE(v4.x);
|
||||
kv_shmem[kv_off + 1u] = KV_TYPE(v4.y);
|
||||
kv_shmem[kv_off + 2u] = KV_TYPE(v4.z);
|
||||
kv_shmem[kv_off + 3u] = KV_TYPE(v4.w);
|
||||
}
|
||||
|
||||
workgroupBarrier();
|
||||
@@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
|
||||
for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
|
||||
let p = p_shmem[subgroup_p_offset + kv_local];
|
||||
let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
|
||||
let v4 = vec4<f32>(
|
||||
let v4 = vec4<KV_TYPE>(
|
||||
kv_shmem[kv_off + 0u],
|
||||
kv_shmem[kv_off + 1u],
|
||||
kv_shmem[kv_off + 2u],
|
||||
kv_shmem[kv_off + 3u]);
|
||||
acc += p * v4;
|
||||
acc += f32(p) * vec4<f32>(v4);
|
||||
}
|
||||
out_regs[reg_idx] = acc;
|
||||
}
|
||||
|
||||
@@ -1581,6 +1581,11 @@ private:
|
||||
const llm_tokenizer_plamo2 & tokenizer;
|
||||
};
|
||||
|
||||
// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
|
||||
// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
|
||||
// text at load
|
||||
static const std::string dna_kmer_marker = "\xee\x80\x80";
|
||||
|
||||
struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
|
||||
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
|
||||
|
||||
@@ -1636,34 +1641,22 @@ private:
|
||||
c = char(c - 32);
|
||||
}
|
||||
}
|
||||
auto is_valid_kmer = [](const std::string & s) {
|
||||
for (char c : s) {
|
||||
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
// k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
|
||||
// isn't in the vocab and falls back to <oov>
|
||||
auto kmer_token = [&](const std::string & kmer) {
|
||||
const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
|
||||
return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + k <= seq.size(); i += k) {
|
||||
const std::string kmer = seq.substr(i, k);
|
||||
if (is_valid_kmer(kmer)) {
|
||||
const auto tok = vocab.text_to_token(kmer);
|
||||
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
|
||||
} else {
|
||||
output.push_back(oov_id);
|
||||
}
|
||||
output.push_back(kmer_token(seq.substr(i, k)));
|
||||
}
|
||||
if (i < seq.size()) {
|
||||
std::string kmer = seq.substr(i);
|
||||
kmer.append(k - kmer.size(), 'A');
|
||||
if (is_valid_kmer(kmer)) {
|
||||
const auto tok = vocab.text_to_token(kmer);
|
||||
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
|
||||
} else {
|
||||
output.push_back(oov_id);
|
||||
}
|
||||
output.push_back(kmer_token(kmer));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2357,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
}
|
||||
GGML_ASSERT(id_to_token.size() == token_to_id.size());
|
||||
|
||||
// hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
|
||||
// it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
|
||||
// k-mers are the block right after <oov>, so only scan from there.
|
||||
if (tokenizer_model == "hybriddna") {
|
||||
const auto idx = token_to_id.find("<oov>");
|
||||
if (idx != token_to_id.end()) {
|
||||
auto it = id_to_token.begin() + idx->second + 1;
|
||||
for (; it != id_to_token.end(); ++it) {
|
||||
std::string & text = it->text;
|
||||
if (text.size() > dna_kmer_marker.size()
|
||||
&& text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
|
||||
text.erase(text.size() - dna_kmer_marker.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
init_tokenizer(type);
|
||||
|
||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||
|
||||
@@ -255,6 +255,10 @@ set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED
|
||||
llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
|
||||
set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model)
|
||||
|
||||
# Test state save/load functionality
|
||||
llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
|
||||
set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model)
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these tests use the backends directly and cannot be built with dynamic loading
|
||||
llama_build_and_test(test-barrier.cpp)
|
||||
|
||||
@@ -2866,15 +2866,24 @@ struct test_set : public test_case {
|
||||
struct test_cpy : public test_case {
|
||||
const ggml_type type_src;
|
||||
const ggml_type type_dst;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const std::array<int64_t, 4> ne_src;
|
||||
const std::array<int64_t, 4> ne_dst;
|
||||
const std::array<int64_t, 4> permute_src;
|
||||
const std::array<int64_t, 4> permute_dst;
|
||||
bool _src_use_permute;
|
||||
bool _dst_use_permute;
|
||||
bool _src_transpose;
|
||||
bool _use_dst_shape;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose);
|
||||
if (_use_dst_shape) {
|
||||
return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
|
||||
}
|
||||
return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose);
|
||||
}
|
||||
|
||||
int64_t total_elements() const {
|
||||
return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3];
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
@@ -2899,7 +2908,7 @@ struct test_cpy : public test_case {
|
||||
err_estimate /= 8.0f;
|
||||
}
|
||||
err_estimate *= err_estimate;
|
||||
err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]);
|
||||
err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements());
|
||||
return err_estimate;
|
||||
}
|
||||
return 1e-6;
|
||||
@@ -2910,17 +2919,19 @@ struct test_cpy : public test_case {
|
||||
}
|
||||
|
||||
test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 1},
|
||||
std::array<int64_t, 4> ne_src = {10, 10, 10, 1},
|
||||
std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
|
||||
std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
|
||||
std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
|
||||
bool transpose_src = false)
|
||||
: type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst),
|
||||
: type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
|
||||
_src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
|
||||
_dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
|
||||
_src_transpose(transpose_src){}
|
||||
_src_transpose(transpose_src),
|
||||
_use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
|
||||
ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
|
||||
ggml_set_param(src);
|
||||
ggml_set_name(src, "src");
|
||||
|
||||
@@ -2934,7 +2945,8 @@ struct test_cpy : public test_case {
|
||||
ggml_set_name(src, "src_transposed");
|
||||
}
|
||||
|
||||
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
|
||||
std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
|
||||
ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
|
||||
ggml_set_name(dst, "dst");
|
||||
|
||||
if (_dst_use_permute) {
|
||||
@@ -8040,42 +8052,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
|
||||
for (int k = 1; k < 4; ++k) {
|
||||
test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}));
|
||||
test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3}));
|
||||
}
|
||||
}
|
||||
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : all_types) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows
|
||||
}
|
||||
}
|
||||
for (ggml_type type_src : all_types) {
|
||||
for (ggml_type type_dst : {GGML_TYPE_F32}) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows
|
||||
}
|
||||
}
|
||||
for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
|
||||
test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous
|
||||
}
|
||||
}
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
|
||||
|
||||
// CPY - different src/dst shapes (reshaping via CPY)
|
||||
// Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
|
||||
// Each src permutation is tested against canonical sorted and reverse dst (skip self).
|
||||
{
|
||||
std::array<int64_t, 4> dims = {3, 5, 7, 32};
|
||||
std::sort(dims.begin(), dims.end());
|
||||
std::array<int64_t, 4> canonical = dims;
|
||||
std::array<int64_t, 4> reversed = {32, 7, 5, 3};
|
||||
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
std::array<int64_t, 4> cur = dims;
|
||||
do {
|
||||
if (cur != canonical) {
|
||||
test_cases.emplace_back(new test_cpy(type, type, cur, canonical));
|
||||
}
|
||||
if (cur != reversed) {
|
||||
test_cases.emplace_back(new test_cpy(type, type, cur, reversed));
|
||||
}
|
||||
if (cur[0] == 32 && type == GGML_TYPE_F32) {
|
||||
if (canonical[0] == 32) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical));
|
||||
}
|
||||
if (reversed[0] == 32) {
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed));
|
||||
}
|
||||
}
|
||||
std::next_permutation(cur.begin(), cur.end());
|
||||
} while (cur != canonical);
|
||||
}
|
||||
}
|
||||
|
||||
for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
|
||||
for (bool use_view_slice : { true, false }) {
|
||||
@@ -8830,9 +8872,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1));
|
||||
test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2));
|
||||
test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3));
|
||||
|
||||
test_cases.emplace_back(new test_pad());
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular
|
||||
test_cases.emplace_back(new test_pad_ext());
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false));
|
||||
test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false));
|
||||
|
||||
test_cases.emplace_back(new test_pad_reflect_1d());
|
||||
test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
|
||||
test_cases.emplace_back(new test_roll());
|
||||
@@ -9132,22 +9189,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
|
||||
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
|
||||
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
|
||||
|
||||
@@ -1,6 +1,23 @@
|
||||
# llama-batched-bench-impl: batched-bench logic, reusable by app
|
||||
|
||||
set(TARGET llama-batched-bench-impl)
|
||||
|
||||
add_library(${TARGET} batched-bench.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-batched-bench executable
|
||||
|
||||
set(TARGET llama-batched-bench)
|
||||
add_executable(${TARGET} batched-bench.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-batched-bench-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
@@ -15,7 +15,10 @@ static void print_usage(int, char ** argv) {
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_batched_bench(int argc, char ** argv);
|
||||
|
||||
int llama_batched_bench(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
5
tools/batched-bench/main.cpp
Normal file
5
tools/batched-bench/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_batched_bench(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_batched_bench(argc, argv);
|
||||
}
|
||||
@@ -2,11 +2,16 @@
|
||||
|
||||
set(TARGET llama-cli-impl)
|
||||
|
||||
add_library(${TARGET} STATIC cli.cpp)
|
||||
add_library(${TARGET} cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
|
||||
target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-cli executable
|
||||
|
||||
set(TARGET llama-cli)
|
||||
|
||||
@@ -2,11 +2,16 @@
|
||||
|
||||
set(TARGET llama-completion-impl)
|
||||
|
||||
add_library(${TARGET} STATIC completion.cpp)
|
||||
add_library(${TARGET} completion.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-completion executable
|
||||
|
||||
set(TARGET llama-completion)
|
||||
|
||||
@@ -1,6 +1,23 @@
|
||||
# llama-fit-params-impl: fit-params logic, reusable by app
|
||||
|
||||
set(TARGET llama-fit-params-impl)
|
||||
|
||||
add_library(${TARGET} fit-params.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-fit-params executable
|
||||
|
||||
set(TARGET llama-fit-params)
|
||||
add_executable(${TARGET} fit-params.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-fit-params-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
@@ -12,7 +12,10 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
|
||||
int llama_fit_params(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
common_init();
|
||||
|
||||
5
tools/fit-params/main.cpp
Normal file
5
tools/fit-params/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_fit_params(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_fit_params(argc, argv);
|
||||
}
|
||||
@@ -2,11 +2,16 @@
|
||||
|
||||
set(TARGET llama-bench-impl)
|
||||
|
||||
add_library(${TARGET} STATIC llama-bench.cpp)
|
||||
add_library(${TARGET} llama-bench.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-bench executable
|
||||
|
||||
set(TARGET llama-bench)
|
||||
|
||||
@@ -1,6 +1,23 @@
|
||||
# llama-perplexity-impl: perplexity logic, reusable by app
|
||||
|
||||
set(TARGET llama-perplexity-impl)
|
||||
|
||||
add_library(${TARGET} perplexity.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-perplexity executable
|
||||
|
||||
set(TARGET llama-perplexity)
|
||||
add_executable(${TARGET} perplexity.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-perplexity-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
5
tools/perplexity/main.cpp
Normal file
5
tools/perplexity/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_perplexity(argc, argv);
|
||||
}
|
||||
@@ -2005,7 +2005,10 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
|
||||
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_perplexity(int argc, char ** argv);
|
||||
|
||||
int llama_perplexity(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
common_params params;
|
||||
|
||||
@@ -1,7 +1,23 @@
|
||||
# llama-quantize-impl: quantize logic, reusable by app
|
||||
|
||||
set(TARGET llama-quantize-impl)
|
||||
|
||||
add_library(${TARGET} quantize.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-quantize executable
|
||||
|
||||
set(TARGET llama-quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama-quantize-impl)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
|
||||
5
tools/quantize/main.cpp
Normal file
5
tools/quantize/main.cpp
Normal file
@@ -0,0 +1,5 @@
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
return llama_quantize(argc, argv);
|
||||
}
|
||||
@@ -490,7 +490,10 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
// satisfies -Wmissing-declarations
|
||||
int llama_quantize(int argc, char ** argv);
|
||||
|
||||
int llama_quantize(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
if (argc < 3) {
|
||||
usage(argv[0]);
|
||||
|
||||
@@ -31,18 +31,23 @@ target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INI
|
||||
|
||||
set(TARGET llama-server-impl)
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
add_library(${TARGET}
|
||||
server.cpp
|
||||
server-http.cpp
|
||||
server-http.h
|
||||
server-models.cpp
|
||||
server-models.h
|
||||
)
|
||||
set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PUBLIC server-context llama-ui cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(LLAMA_TOOLS_INSTALL)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
endif()
|
||||
|
||||
# llama-server executable
|
||||
|
||||
set(TARGET llama-server)
|
||||
|
||||
@@ -506,6 +506,9 @@ struct server_slot {
|
||||
|
||||
if (ptask) {
|
||||
res["id_task"] = ptask->id;
|
||||
res["n_prompt_tokens"] = (int32_t) prompt.tokens.size();
|
||||
res["n_prompt_tokens_processed"] = n_prompt_tokens_processed;
|
||||
res["n_prompt_tokens_cache"] = n_prompt_tokens_cache;
|
||||
res["params"] = ptask->params.to_json(only_metrics);
|
||||
res["next_token"] = {
|
||||
{
|
||||
@@ -701,6 +704,10 @@ private:
|
||||
bool sleeping = false;
|
||||
|
||||
void destroy() {
|
||||
spec.reset();
|
||||
ctx_dft.reset();
|
||||
model_dft.reset();
|
||||
|
||||
llama_init.reset();
|
||||
|
||||
ctx_tgt = nullptr;
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <queue>
|
||||
@@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
|
||||
// TODO: maybe validate preset before rendering ?
|
||||
// render args
|
||||
args = preset.to_args(bin_path);
|
||||
|
||||
// unified binary dispatches by subcommand, re-inject it right after the
|
||||
// binary path so the child starts as 'llama serve ...' not 'llama ...'
|
||||
const char * app_cmd = std::getenv("LLAMA_APP_CMD");
|
||||
if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) {
|
||||
args.insert(args.begin() + 1, app_cmd);
|
||||
}
|
||||
}
|
||||
|
||||
void server_model_meta::update_caps() {
|
||||
|
||||
Reference in New Issue
Block a user