mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
77 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
923e3ea2e3 | ||
|
|
e743cddb60 | ||
|
|
05fec5bd29 | ||
|
|
dcf7f2ea3c | ||
|
|
84b396e051 | ||
|
|
c31e60647d | ||
|
|
67eade1bf9 | ||
|
|
7de5c7cab6 | ||
|
|
8eff95544e | ||
|
|
3120413ccd | ||
|
|
215535701d | ||
|
|
74bb294591 | ||
|
|
3e303b1107 | ||
|
|
0c1df14b5f | ||
|
|
b3ad3a0191 | ||
|
|
98197e5c98 | ||
|
|
f5e96b368f | ||
|
|
756aa1020a | ||
|
|
aaa088d87f | ||
|
|
0d5375d54b | ||
|
|
576c82eda2 | ||
|
|
0aedae00e6 | ||
|
|
6bdda13981 | ||
|
|
0b8855775c | ||
|
|
4bb625b713 | ||
|
|
11ee0fea2a | ||
|
|
a457551332 | ||
|
|
704bb7a71c | ||
|
|
435a6d10d6 | ||
|
|
f9a867f592 | ||
|
|
ac44eb6c80 | ||
|
|
a57d1bcb3c | ||
|
|
cb9178f885 | ||
|
|
4a5686da22 | ||
|
|
98bab638fb | ||
|
|
26a48ad699 | ||
|
|
ffd59e7d18 | ||
|
|
105554595f | ||
|
|
04655063c4 | ||
|
|
20b7bf8a32 | ||
|
|
6efcd65945 | ||
|
|
699f4392a3 | ||
|
|
08382869a2 | ||
|
|
bb4f7a9e4e | ||
|
|
b8eeb8741d | ||
|
|
17a1f0d2d4 | ||
|
|
8f22dc0a53 | ||
|
|
53903ae6fa | ||
|
|
4d0dcd4a06 | ||
|
|
75c91de6e9 | ||
|
|
68155c66f0 | ||
|
|
e1a7059053 | ||
|
|
12f55c302b | ||
|
|
b9c3eefde1 | ||
|
|
6491d6e4f1 | ||
|
|
e592be1575 | ||
|
|
a0374a67e2 | ||
|
|
ddef99522d | ||
|
|
6681688146 | ||
|
|
bac8bed248 | ||
|
|
b81510a7b7 | ||
|
|
ef797db357 | ||
|
|
67d1ef23c6 | ||
|
|
7b50f7c025 | ||
|
|
c79184d2d1 | ||
|
|
499a8f5a78 | ||
|
|
28657a8229 | ||
|
|
bee28421be | ||
|
|
2b72bedec1 | ||
|
|
c8c4495b8d | ||
|
|
7b63a71a6b | ||
|
|
0c2ee38ab7 | ||
|
|
a70c8a0c4b | ||
|
|
9067487c44 | ||
|
|
d4cdd9c1c3 | ||
|
|
55c2646b45 | ||
|
|
e75ba4c043 |
@@ -40,7 +40,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
2
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -42,7 +42,7 @@ body:
|
||||
attributes:
|
||||
label: GGML backends
|
||||
description: Which GGML backends do you know to be affected?
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
||||
multiple: true
|
||||
validations:
|
||||
required: true
|
||||
|
||||
6
.github/labeler.yml
vendored
6
.github/labeler.yml
vendored
@@ -1,10 +1,4 @@
|
||||
# https://github.com/actions/labeler
|
||||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml/include/ggml-kompute.h
|
||||
- ggml/src/ggml-kompute/**
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
|
||||
13
.github/workflows/build.yml
vendored
13
.github/workflows/build.yml
vendored
@@ -342,7 +342,7 @@ jobs:
|
||||
cd build
|
||||
export GGML_VK_VISIBLE_DEVICES=0
|
||||
# This is using llvmpipe and runs slower than other backends
|
||||
ctest -L main --verbose --timeout 3600
|
||||
ctest -L main --verbose --timeout 4200
|
||||
|
||||
ubuntu-22-cmake-hip:
|
||||
runs-on: ubuntu-22.04
|
||||
@@ -740,9 +740,6 @@ jobs:
|
||||
- build: 'llvm-arm64-opencl-adreno'
|
||||
arch: 'arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
|
||||
# - build: 'kompute-x64'
|
||||
# arch: 'x64'
|
||||
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -756,12 +753,6 @@ jobs:
|
||||
variant: ccache
|
||||
evict-old-files: 1d
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute-x64' }}
|
||||
run: |
|
||||
git submodule update --init ggml/src/ggml-kompute/kompute
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
@@ -777,7 +768,7 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
|
||||
if: ${{ matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
|
||||
40
.github/workflows/update-ops-docs.yml
vendored
Normal file
40
.github/workflows/update-ops-docs.yml
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
name: Update Operations Documentation
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'docs/ops/**'
|
||||
- 'scripts/create_ops_docs.py'
|
||||
|
||||
jobs:
|
||||
update-ops-docs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.x'
|
||||
|
||||
- name: Generate operations documentation to temporary file
|
||||
run: |
|
||||
mkdir -p /tmp/ops_check
|
||||
./scripts/create_ops_docs.py /tmp/ops_check/ops.md
|
||||
|
||||
- name: Check if docs/ops.md matches generated version
|
||||
run: |
|
||||
if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
|
||||
echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
|
||||
echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
|
||||
echo "Differences found:"
|
||||
diff docs/ops.md /tmp/ops_check/ops.md || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Operations documentation is up to date."
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +0,0 @@
|
||||
[submodule "kompute"]
|
||||
path = ggml/src/ggml-kompute/kompute
|
||||
url = https://github.com/nomic-ai/kompute.git
|
||||
|
||||
@@ -120,7 +120,6 @@ endfunction()
|
||||
|
||||
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
||||
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
||||
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
||||
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
||||
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
||||
|
||||
@@ -55,6 +55,17 @@
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "x64-linux-gcc", "hidden": true,
|
||||
"cacheVariables": {
|
||||
"CMAKE_C_COMPILER": "gcc",
|
||||
"CMAKE_CXX_COMPILER": "g++"
|
||||
}
|
||||
},
|
||||
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
|
||||
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
|
||||
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
|
||||
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||
|
||||
10
README.md
10
README.md
@@ -6,9 +6,9 @@
|
||||
[](https://github.com/ggml-org/llama.cpp/releases)
|
||||
[](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
|
||||
|
||||
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
|
||||
[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
|
||||
|
||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||
LLM inference in C/C++
|
||||
|
||||
## Recent API changes
|
||||
|
||||
@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
|
||||
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
||||
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
|
||||
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
|
||||
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
|
||||
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
|
||||
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
|
||||
@@ -134,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
||||
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
||||
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
||||
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
||||
|
||||
#### Multimodal
|
||||
|
||||
|
||||
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
|
||||
endif()
|
||||
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
|
||||
include_directories(${CURL_INCLUDE_DIRS})
|
||||
find_library(CURL_LIBRARY curl REQUIRED)
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
|
||||
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
|
||||
endif ()
|
||||
|
||||
if (LLAMA_LLGUIDANCE)
|
||||
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
|
||||
|
||||
ExternalProject_Add(llguidance_ext
|
||||
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
|
||||
# v0.7.20 (+ fix to build on GCC 15):
|
||||
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
|
||||
# v1.0.1:
|
||||
GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
|
||||
PREFIX ${CMAKE_BINARY_DIR}/llguidance
|
||||
SOURCE_DIR ${LLGUIDANCE_SRC}
|
||||
BUILD_IN_SOURCE TRUE
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND cargo build --release
|
||||
BUILD_COMMAND cargo build --release --package llguidance
|
||||
INSTALL_COMMAND ""
|
||||
BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
|
||||
UPDATE_COMMAND ""
|
||||
|
||||
@@ -2734,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.public_path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
||||
add_opt(common_arg(
|
||||
{"--api-prefix"}, "PREFIX",
|
||||
string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.api_prefix = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
||||
add_opt(common_arg(
|
||||
{"--no-webui"},
|
||||
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
|
||||
|
||||
@@ -370,6 +370,7 @@ struct common_params {
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = ""; // NOLINT
|
||||
std::string api_prefix = ""; // NOLINT
|
||||
std::string chat_template = ""; // NOLINT
|
||||
bool use_jinja = false; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
|
||||
@@ -300,6 +300,7 @@ class ModelBase:
|
||||
gguf.MODEL_TENSOR.POS_EMBD,
|
||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||
gguf.MODEL_TENSOR.SSM_CONV1D,
|
||||
gguf.MODEL_TENSOR.SHORTCONV_CONV,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
||||
@@ -815,6 +816,30 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
|
||||
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
|
||||
res = "minerva-7b"
|
||||
if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
|
||||
# ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
|
||||
res = "hunyuan"
|
||||
if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
|
||||
# ref: https://huggingface.co/skt/A.X-4.0
|
||||
res = "a.x-4.0"
|
||||
if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
|
||||
# ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
|
||||
res = "falcon-h1"
|
||||
if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
|
||||
# ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
|
||||
res = "falcon-h1"
|
||||
if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
|
||||
# ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
|
||||
res = "falcon-h1"
|
||||
if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
|
||||
# ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
|
||||
res = "falcon-h1"
|
||||
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
|
||||
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
|
||||
res = "midm-2.0"
|
||||
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
|
||||
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
|
||||
res = "lfm2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -4408,9 +4433,6 @@ class Gemma3NModel(Gemma3Model):
|
||||
]
|
||||
|
||||
def set_vocab(self):
|
||||
with open(self.dir_model / "chat_template.jinja") as f:
|
||||
# quick hack to make sure chat template is added
|
||||
self.gguf_writer.add_chat_template(f.read())
|
||||
super().set_vocab()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
@@ -4875,6 +4897,9 @@ class Mamba2Model(TextModel):
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
|
||||
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
||||
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
|
||||
self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
|
||||
|
||||
def set_vocab(self):
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
@@ -4897,30 +4922,29 @@ class Mamba2Model(TextModel):
|
||||
self._set_vocab_builtin("gpt-neox", vocab_size)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
||||
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
|
||||
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
|
||||
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
|
||||
head_dim = self.find_hparam(["head_dim"], optional=True) or 64
|
||||
n_group = self.find_hparam(["n_groups"], optional=True) or 1
|
||||
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
|
||||
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
|
||||
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
|
||||
|
||||
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
||||
|
||||
# Fail early for models which don't have a block expansion factor of 2
|
||||
# TODO: does this really matter?
|
||||
assert d_inner == 2 * d_model
|
||||
assert d_inner % head_dim == 0
|
||||
# skip the assertion for FalconH1 Model
|
||||
if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
|
||||
assert self.d_inner == 2 * self.d_model
|
||||
assert self.d_inner % head_dim == 0
|
||||
|
||||
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
|
||||
self.gguf_writer.add_embedding_length(d_model)
|
||||
self.gguf_writer.add_embedding_length(self.d_model)
|
||||
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
||||
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
||||
self.gguf_writer.add_ssm_inner_size(d_inner)
|
||||
self.gguf_writer.add_ssm_inner_size(self.d_inner)
|
||||
self.gguf_writer.add_ssm_state_size(d_state)
|
||||
self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
|
||||
self.gguf_writer.add_ssm_group_count(n_group)
|
||||
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
|
||||
self.gguf_writer.add_ssm_group_count(self.n_group)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
@@ -4945,10 +4969,7 @@ class Mamba2Model(TextModel):
|
||||
# (D is also unsqueezed, but for more straightforward broadcast internally)
|
||||
data_torch = data_torch.reshape((*data_torch.shape, 1))
|
||||
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
|
||||
d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
|
||||
d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
|
||||
n_group = self.hparams.get("n_groups", 1)
|
||||
data_torch = data_torch.reshape((n_group, d_inner // n_group))
|
||||
data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
@@ -4957,6 +4978,123 @@ class Mamba2Model(TextModel):
|
||||
yield (new_name, data_torch)
|
||||
|
||||
|
||||
@ModelBase.register("JambaForCausalLM")
|
||||
class JambaModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.JAMBA
|
||||
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
del tokenizer # unused
|
||||
|
||||
return "gpt-2"
|
||||
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.model").is_file():
|
||||
# Using Jamba's tokenizer.json causes errors on model load
|
||||
# (something about "byte not found in vocab"),
|
||||
# but there's a working tokenizer.model
|
||||
self._set_vocab_sentencepiece()
|
||||
else:
|
||||
# Some Jamba models only have a tokenizer.json, which works.
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
|
||||
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
|
||||
d_inner = self.hparams["mamba_expand"] * d_model
|
||||
d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
|
||||
# ceiling division
|
||||
# ref: https://stackoverflow.com/a/17511341/22827863
|
||||
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
||||
dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
|
||||
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
|
||||
n_kv_head = self.hparams["num_key_value_heads"]
|
||||
attn_offset = self.hparams["attn_layer_offset"]
|
||||
attn_period = self.hparams["attn_layer_period"]
|
||||
n_kv_vec = [0 for _ in range(attn_offset)] + [
|
||||
n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
|
||||
]
|
||||
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
|
||||
self.gguf_writer.add_embedding_length(d_model)
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(n_kv_vec)
|
||||
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
||||
self.gguf_writer.add_ssm_inner_size(d_inner)
|
||||
self.gguf_writer.add_ssm_state_size(d_state)
|
||||
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
||||
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
||||
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
# Mini-Jamba
|
||||
name = name.replace(".moe.", ".feed_forward.")
|
||||
if bid is not None:
|
||||
moe_offset = self.hparams["expert_layer_offset"]
|
||||
moe_period = self.hparams["expert_layer_period"]
|
||||
|
||||
if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
|
||||
name = name.replace(".experts.0.", ".")
|
||||
|
||||
# process the experts separately
|
||||
if ".feed_forward.experts." in name:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for wid in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
# using the same merged name as qwen2moe
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
yield new_name, data_torch
|
||||
return
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
|
||||
data_torch = data_torch.squeeze()
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
|
||||
yield (new_name, data_torch)
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("CohereForCausalLM")
|
||||
class CommandR2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.COMMAND_R
|
||||
@@ -6318,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
|
||||
]
|
||||
|
||||
has_experts = bool(self.hparams.get('num_local_experts'))
|
||||
|
||||
if name.endswith("shared_mlp.input_linear.weight"):
|
||||
ffn_dim = self.hparams["shared_intermediate_size"]
|
||||
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
|
||||
gate, up = data_torch.split(ffn_dim, dim=-2)
|
||||
if has_experts:
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
|
||||
]
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
|
||||
]
|
||||
|
||||
if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
|
||||
]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
|
||||
class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
|
||||
"""GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
|
||||
layers and optionally uses MoE w/ a shared expert"""
|
||||
model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
|
||||
undo_permute = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
||||
# Hybrid mamba models use a prefix for the mamba-specific params.
|
||||
# TODO: Extend this if the prefix(es) need to be configurable
|
||||
self.hparam_prefixes = ["mamba"]
|
||||
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Lists of which layers use ssm vs attention
|
||||
self._attn_layers = self.get_attn_layers()
|
||||
self._ssm_layers = [
|
||||
i for i in range(self.block_count)
|
||||
if i not in self._attn_layers
|
||||
]
|
||||
|
||||
# n_group and d_inner are used during reshape_tensors for mamba2
|
||||
self.d_model = self.find_hparam(["hidden_size", "d_model"])
|
||||
self.n_group = self.find_hparam(["n_groups"])
|
||||
self.d_inner = self.find_hparam(["expand"]) * self.d_model
|
||||
|
||||
def get_attn_layers(self):
|
||||
# Explicit list of layer type names
|
||||
if layer_types := self.hparams.get("layer_types"):
|
||||
return [
|
||||
i for i, typ in enumerate(layer_types)
|
||||
if typ == "attention"
|
||||
]
|
||||
|
||||
# Layer types indicated by index or period
|
||||
attn_layers = self.hparams.get("attn_layer_indices", [])
|
||||
if not attn_layers:
|
||||
attn_period = self.hparams.get("attn_layer_period")
|
||||
assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
|
||||
attn_offset = self.hparams.get("attn_layer_offset")
|
||||
assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
|
||||
attn_layers = [
|
||||
i for i in range(self.block_count)
|
||||
if i % attn_period == attn_offset
|
||||
]
|
||||
return attn_layers
|
||||
|
||||
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
|
||||
prefixed = []
|
||||
for pfx in self.hparam_prefixes:
|
||||
prefixed.extend(
|
||||
"_".join([pfx, k])
|
||||
for k in keys
|
||||
)
|
||||
keys = list(keys) + prefixed
|
||||
return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
|
||||
|
||||
def modify_tensors(
|
||||
self, data_torch: Tensor, name: str, bid: int | None
|
||||
) -> Iterable[tuple[str, Tensor]]:
|
||||
if (
|
||||
name.endswith("block_sparse_moe.input_linear.weight")
|
||||
or "shared_mlp" in name
|
||||
):
|
||||
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
|
||||
|
||||
# Determine whether this is a mamba layer or an attention layer
|
||||
if bid in self._ssm_layers:
|
||||
return Mamba2Model.modify_tensors(self, data_torch, name, bid)
|
||||
elif bid in self._attn_layers:
|
||||
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
"""This method merges params from both parents and some that are
|
||||
specific to this model. The result is some duplication of how the params
|
||||
get set. The following warnings are expected during conversion:
|
||||
|
||||
WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
|
||||
WARNING:Duplicated key name 'granitehybrid.context_length'
|
||||
"""
|
||||
GraniteMoeModel.set_gguf_parameters(self)
|
||||
|
||||
## Mamba mixer params ##
|
||||
self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
|
||||
self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
|
||||
self.gguf_writer.add_ssm_group_count(self.n_group)
|
||||
self.gguf_writer.add_ssm_inner_size(self.d_inner)
|
||||
# NOTE: The mamba_dt_rank is _not_ the right field for how this is used
|
||||
# in llama.cpp
|
||||
self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
|
||||
|
||||
## Attention params ##
|
||||
head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
||||
head_count_kv_vec = [
|
||||
head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
|
||||
]
|
||||
if rope_dim := self.hparams.get("attn_rotary_emb"):
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv_vec)
|
||||
|
||||
## If Bamba, use rope, otherwise don't
|
||||
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
|
||||
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
|
||||
if not use_rope:
|
||||
self.gguf_writer.add_context_length(2**20)
|
||||
|
||||
## Validation ##
|
||||
d_head = self.find_hparam(["d_head"], optional=True) or 64
|
||||
assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
|
||||
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
|
||||
|
||||
def set_vocab(self):
|
||||
self.hparams["pad_vocab_size_multiple"] = 8
|
||||
Mamba2Model.set_vocab(self)
|
||||
|
||||
|
||||
@ModelBase.register("BailingMoeForCausalLM")
|
||||
class BailingMoeModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.BAILINGMOE
|
||||
@@ -6538,6 +6806,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
|
||||
|
||||
|
||||
@ModelBase.register("FalconH1ForCausalLM")
|
||||
class FalconH1Model(Mamba2Model):
|
||||
model_arch = gguf.MODEL_ARCH.FALCON_H1
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
# Set the hparam prefixes for Falcon Mamba2
|
||||
self.hparam_prefixes = ["mamba"]
|
||||
|
||||
# Initialize the base Mamba2Model
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Use Llama conversion for attention
|
||||
self._transformer_model_class = LlamaModel
|
||||
|
||||
# n_group and d_inner are used during reshape_tensors for mamba2
|
||||
self.n_group = self.find_hparam(["n_groups"])
|
||||
self.d_inner = self.find_hparam(["mamba_d_ssm"])
|
||||
self.d_head = self.find_hparam(["d_head"])
|
||||
|
||||
# Initialize any Falcon Mamba2 specific attributes
|
||||
self.has_attention = True # Falcon Mamba2 has attention components
|
||||
|
||||
# Load Falcon-H1 multipliers from hyperparameters
|
||||
self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
|
||||
self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
|
||||
self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
|
||||
self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
|
||||
self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
|
||||
self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
|
||||
self.intermediate_size = self.find_hparam(["intermediate_size"])
|
||||
self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
|
||||
|
||||
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
|
||||
prefixed = []
|
||||
for pfx in self.hparam_prefixes:
|
||||
prefixed.extend(
|
||||
"_".join([pfx, k])
|
||||
for k in keys
|
||||
)
|
||||
keys = list(keys) + prefixed
|
||||
return super().find_hparam(keys, *args, **kwargs)
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
tensors = list(super().modify_tensors(data_torch, name, bid))
|
||||
tensor = tensors[0][1]
|
||||
|
||||
if "down_proj" in name:
|
||||
tensor = tensor * self.mlp_multipliers[1]
|
||||
elif "gate_proj" in name:
|
||||
tensor = tensor * self.mlp_multipliers[0]
|
||||
elif "k_proj" in name:
|
||||
tensor = tensor * self.key_multiplier * self.attention_in_multiplier
|
||||
elif "q_proj" in name:
|
||||
tensor = tensor * self.attention_in_multiplier
|
||||
elif "v_proj" in name:
|
||||
tensor = tensor * self.attention_in_multiplier
|
||||
elif "o_proj" in name:
|
||||
tensor = tensor * self.attention_out_multiplier
|
||||
elif "out_proj" in name:
|
||||
tensor = tensor * self.ssm_out_multiplier
|
||||
elif "in_proj" in name:
|
||||
tensor = tensor * self.ssm_in_multiplier
|
||||
zxbcdt_multipliers = self.hparams["ssm_multipliers"]
|
||||
intermediate_size = self.hparams["mamba_d_ssm"]
|
||||
groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
|
||||
tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
|
||||
tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
|
||||
tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
|
||||
tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
|
||||
tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
|
||||
elif "lm_head" in name:
|
||||
tensor = tensor * self.hparams["lm_head_multiplier"]
|
||||
elif "embed_tokens" in name:
|
||||
tensor = tensor * self.hparams["embedding_multiplier"]
|
||||
elif "mamba.norm" in name:
|
||||
tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
|
||||
|
||||
tensors = [(tensors[0][0], tensor)]
|
||||
return tensors
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
## General Params ##
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
# Override some Mamba2 defaults
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
|
||||
## Attention params ##
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_key_length(self.hparams["head_dim"])
|
||||
self.gguf_writer.add_value_length(self.hparams["head_dim"])
|
||||
|
||||
## Validation ##
|
||||
assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
|
||||
assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
|
||||
|
||||
# Add any other Falcon Mamba2 specific configuration
|
||||
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
|
||||
|
||||
|
||||
@ModelBase.register("HunYuanMoEV1ForCausalLM")
|
||||
class HunYuanMoEModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# For handling tied embeddings
|
||||
self._tok_embd = None
|
||||
|
||||
def set_vocab(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
|
||||
|
||||
# 1. Get the pre-tokenizer identifier hash
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
|
||||
# 2. Reverse-engineer the merges list from mergeable_ranks
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[QwenModel.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
if len(merged) == 2: # todo this is an assert in Qwen, why?
|
||||
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
|
||||
|
||||
# 3. Generate the tokens and toktypes lists
|
||||
vocab_size = self.hparams["vocab_size"]
|
||||
assert tokenizer.vocab_size == vocab_size
|
||||
special_tokens = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
else:
|
||||
token = reverse_vocab[i]
|
||||
tokens.append(token)
|
||||
if i in special_tokens.values():
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
# 4. Write all vocab-related fields to the GGUF writer
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_token_merges(merges)
|
||||
|
||||
# 5. Add special tokens and chat templates
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
# FIX for BOS token: Overwrite incorrect id read from config.json
|
||||
self.gguf_writer.add_bos_token_id(127959) # <|bos|>
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
|
||||
self.gguf_writer.add_expert_count(hparams["num_experts"])
|
||||
self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
|
||||
|
||||
moe_intermediate_size = hparams["moe_intermediate_size"]
|
||||
assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
|
||||
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
|
||||
|
||||
moe_topk = hparams["moe_topk"]
|
||||
assert all(topk == moe_topk[0] for topk in moe_topk)
|
||||
self.gguf_writer.add_expert_used_count(moe_topk[0])
|
||||
|
||||
moe_shared_expert = hparams["num_shared_expert"]
|
||||
assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
|
||||
self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
|
||||
|
||||
# Rope
|
||||
rope_scaling = hparams.get("rope_scaling", {})
|
||||
if rope_scaling.get("type") == "dynamic":
|
||||
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
|
||||
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
|
||||
alpha = rope_scaling.get("alpha", 1000)
|
||||
base = hparams.get("rope_theta", 10000.0)
|
||||
dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
|
||||
scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
|
||||
self.gguf_writer.add_rope_freq_base(scaled_base)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_rope_scaling_factor(1)
|
||||
# There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
|
||||
self.gguf_writer.add_context_length(256 * 1024) # 256k context length
|
||||
|
||||
# if any of our assumptions about the values are wrong, something has changed and this may need to be updated
|
||||
assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
|
||||
"HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name == "model.embed_tokens.weight":
|
||||
self._tok_embd = data_torch.clone()
|
||||
|
||||
if name == "lm_head.weight":
|
||||
if self.hparams.get("tie_word_embeddings", False):
|
||||
logger.info("Skipping tied output layer 'lm_head.weight'")
|
||||
return []
|
||||
|
||||
if name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
# merge the experts into a single 3d tensor
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
if self._experts is not None:
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("SmolLM3ForCausalLM")
|
||||
class SmolLM3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.SMOLLM3
|
||||
|
||||
def set_vocab(self):
|
||||
super().set_vocab()
|
||||
# remove unsupported array slicing in chat template
|
||||
# ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
if tokenizer.chat_template is not None:
|
||||
chat_template = tokenizer.chat_template.replace("[:]", "")
|
||||
self.gguf_writer.add_chat_template(chat_template)
|
||||
|
||||
|
||||
@ModelBase.register("Lfm2ForCausalLM")
|
||||
@ModelBase.register("LFM2ForCausalLM")
|
||||
class LFM2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.LFM2
|
||||
|
||||
def _add_feed_forward_length(self):
|
||||
ff_dim = self.hparams["block_ff_dim"]
|
||||
|
||||
auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
|
||||
ff_dim = self.hparams["block_ff_dim"]
|
||||
ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
|
||||
multiple_of = self.hparams["block_multiple_of"]
|
||||
|
||||
if auto_adjust_ff_dim:
|
||||
ff_dim = int(2 * ff_dim / 3)
|
||||
# custom dim factor multiplier
|
||||
if ffn_dim_multiplier is not None:
|
||||
ff_dim = int(ffn_dim_multiplier * ff_dim)
|
||||
ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
|
||||
|
||||
self.gguf_writer.add_feed_forward_length(ff_dim)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
# set num_key_value_heads only for attention layers
|
||||
self.hparams["num_key_value_heads"] = [
|
||||
self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
|
||||
for layer_type in self.hparams["layer_types"]
|
||||
]
|
||||
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
|
||||
self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
|
||||
self._add_feed_forward_length()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# conv op requires 2d tensor
|
||||
if 'conv.conv' in name:
|
||||
data_torch = data_torch.squeeze(1)
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -128,6 +128,9 @@ models = [
|
||||
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
|
||||
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
|
||||
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
|
||||
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
|
||||
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
|
||||
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
@@ -137,6 +140,12 @@ pre_computed_hashes = [
|
||||
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
|
||||
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
|
||||
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
|
||||
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
|
||||
# falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
|
||||
|
||||
### 2. Define the model architecture in `llama.cpp`
|
||||
|
||||
The model params and tensors layout must be defined in `llama.cpp`:
|
||||
1. Define a new `llm_arch`
|
||||
2. Define the tensors layout in `LLM_TENSOR_NAMES`
|
||||
3. Add any non-standard metadata in `llm_load_hparams`
|
||||
4. Create the tensors for inference in `llm_load_tensors`
|
||||
5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
|
||||
The model params and tensors layout must be defined in `llama.cpp` source files:
|
||||
1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
|
||||
2. In `src/llama-arch.cpp`:
|
||||
- Add the architecture name to the `LLM_ARCH_NAMES` map.
|
||||
- Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
|
||||
3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
|
||||
4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`.
|
||||
|
||||
NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
|
||||
|
||||
### 3. Build the GGML graph implementation
|
||||
|
||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
|
||||
|
||||
Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
|
||||
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
|
||||
Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
|
||||
Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
|
||||
Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
|
||||
|
||||
Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
|
||||
|
||||
|
||||
95
docs/ops.md
Normal file
95
docs/ops.md
Normal file
@@ -0,0 +1,95 @@
|
||||
# GGML Operations
|
||||
|
||||
List of GGML operations and backend support status.
|
||||
|
||||
Legend:
|
||||
- ✅ Fully supported by this backend
|
||||
- 🟡 Partially supported by this backend
|
||||
- ❌ Not supported by this backend
|
||||
|
||||
| Operation | BLAS | CPU | CUDA | Metal |
|
||||
|-----------|------|------|------|------|
|
||||
| ABS | ❌ | ✅ | 🟡 | ❌ |
|
||||
| ACC | ❌ | ✅ | ✅ | ✅ |
|
||||
| ADD | ❌ | ✅ | ✅ | 🟡 |
|
||||
| ADD1 | ❌ | ✅ | ✅ | ❌ |
|
||||
| ARANGE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
|
||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
|
||||
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
|
||||
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
|
||||
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
|
||||
| COS | ❌ | ✅ | ✅ | 🟡 |
|
||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
|
||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DIV | ❌ | ✅ | ✅ | 🟡 |
|
||||
| DUP | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| ELU | ❌ | ✅ | ❌ | 🟡 |
|
||||
| EXP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
|
||||
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
|
||||
| GELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
|
||||
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
|
||||
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
|
||||
| LOG | ❌ | ✅ | ✅ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ |
|
||||
| MUL | ❌ | ✅ | ✅ | 🟡 |
|
||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
|
||||
| NEG | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
|
||||
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
|
||||
| PAD | ❌ | ✅ | ✅ | ✅ |
|
||||
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
|
||||
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
|
||||
| REGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RELU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
|
||||
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
|
||||
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE | ❌ | ✅ | ✅ | ✅ |
|
||||
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
|
||||
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
|
||||
| SCALE | ❌ | ✅ | ✅ | ✅ |
|
||||
| SET | ❌ | ✅ | ❌ | ✅ |
|
||||
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
|
||||
| SGN | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
|
||||
| SIN | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
|
||||
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
|
||||
| SQR | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SQRT | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
|
||||
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
|
||||
| STEP | ❌ | ✅ | 🟡 | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | 🟡 |
|
||||
| SUM | ❌ | ✅ | ✅ | ❌ |
|
||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
|
||||
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
|
||||
| TANH | ❌ | ✅ | 🟡 | 🟡 |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
|
||||
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |
|
||||
6534
docs/ops/BLAS.csv
Normal file
6534
docs/ops/BLAS.csv
Normal file
File diff suppressed because it is too large
Load Diff
6534
docs/ops/CPU.csv
Normal file
6534
docs/ops/CPU.csv
Normal file
File diff suppressed because it is too large
Load Diff
6534
docs/ops/CUDA.csv
Normal file
6534
docs/ops/CUDA.csv
Normal file
File diff suppressed because it is too large
Load Diff
6534
docs/ops/Metal.csv
Normal file
6534
docs/ops/Metal.csv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -136,6 +136,11 @@ static bool run(llama_context * ctx, const common_params & params) {
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (tokens.empty()) {
|
||||
LOG_ERR("%s : there are not input tokens to process - (try to provide a prompt with '-p')\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
|
||||
@@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
|
||||
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
|
||||
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
||||
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
||||
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||
@@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS
|
||||
include/ggml-cann.h
|
||||
include/ggml-cpp.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml-kompute.h
|
||||
include/ggml-opt.h
|
||||
include/ggml-metal.h
|
||||
include/ggml-rpc.h
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_KOMPUTE_MAX_DEVICES 16
|
||||
|
||||
struct ggml_vk_device {
|
||||
int index;
|
||||
int type; // same as VkPhysicalDeviceType
|
||||
size_t heapSize;
|
||||
const char * name;
|
||||
const char * vendor;
|
||||
int subgroupSize;
|
||||
uint64_t bufferAlignment;
|
||||
uint64_t maxAlloc;
|
||||
};
|
||||
|
||||
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
|
||||
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
|
||||
bool ggml_vk_has_vulkan(void);
|
||||
bool ggml_vk_has_device(void);
|
||||
struct ggml_vk_device ggml_vk_current_device(void);
|
||||
|
||||
//
|
||||
// backend API
|
||||
//
|
||||
|
||||
// forward declaration
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||
|
||||
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -495,7 +495,7 @@ extern "C" {
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
GGML_OP_POOL_2D_BACK,
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_UPSCALE,
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_PAD_REFLECT_1D,
|
||||
GGML_OP_ROLL,
|
||||
@@ -557,6 +557,8 @@ extern "C" {
|
||||
GGML_GLU_OP_REGLU,
|
||||
GGML_GLU_OP_GEGLU,
|
||||
GGML_GLU_OP_SWIGLU,
|
||||
GGML_GLU_OP_GEGLU_ERF,
|
||||
GGML_GLU_OP_GEGLU_QUICK,
|
||||
|
||||
GGML_GLU_OP_COUNT,
|
||||
};
|
||||
@@ -1147,6 +1149,22 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_erf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_quick(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// A: n columns, r rows,
|
||||
// B: n columns, r rows,
|
||||
GGML_API struct ggml_tensor * ggml_glu_split(
|
||||
@@ -1170,6 +1188,16 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_erf_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_geglu_quick_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// normalize along rows
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1269,6 +1297,19 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
float s);
|
||||
|
||||
// x = s * a + b
|
||||
GGML_API struct ggml_tensor * ggml_scale_bias(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float s,
|
||||
float b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float s,
|
||||
float b);
|
||||
|
||||
// b -> view(a,offset,nb1,nb2,3), return modified a
|
||||
GGML_API struct ggml_tensor * ggml_set(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1983,15 +2024,16 @@ extern "C" {
|
||||
|
||||
#define GGML_KQ_MASK_PAD 64
|
||||
|
||||
// q: [n_embd_k, n_batch, n_head, ne3]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, ne3]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, ne3] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, ne32, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, ne3] !! permuted !!
|
||||
// q: [n_embd_k, n_batch, n_head, ne3 ]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
|
||||
//
|
||||
// broadcast:
|
||||
// n_head % n_head_kv == 0
|
||||
// ne3 % ne32 == 0
|
||||
// n_head % ne32 == 0
|
||||
// ne3 % ne33 == 0
|
||||
//
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
@@ -365,7 +365,6 @@ ggml_add_backend(BLAS)
|
||||
ggml_add_backend(CANN)
|
||||
ggml_add_backend(CUDA)
|
||||
ggml_add_backend(HIP)
|
||||
ggml_add_backend(Kompute)
|
||||
ggml_add_backend(METAL)
|
||||
ggml_add_backend(MUSA)
|
||||
ggml_add_backend(RPC)
|
||||
|
||||
@@ -61,10 +61,6 @@
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
@@ -189,9 +185,6 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
@@ -575,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
|
||||
ggml_backend_load_best("cann", silent, dir_path);
|
||||
ggml_backend_load_best("cuda", silent, dir_path);
|
||||
ggml_backend_load_best("hip", silent, dir_path);
|
||||
ggml_backend_load_best("kompute", silent, dir_path);
|
||||
ggml_backend_load_best("metal", silent, dir_path);
|
||||
ggml_backend_load_best("rpc", silent, dir_path);
|
||||
ggml_backend_load_best("sycl", silent, dir_path);
|
||||
|
||||
@@ -67,6 +67,7 @@
|
||||
#include <aclnnop/aclnn_pow.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
||||
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
||||
#include <aclnnop/aclnn_zero.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -804,10 +805,11 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
|
||||
nb[i] = nb[i - 1] * ne[i - 1];
|
||||
}
|
||||
|
||||
ggml_cann_async_memset(ctx, buffer, n_bytes, 0);
|
||||
aclTensor* zero =
|
||||
ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
|
||||
return zero;
|
||||
GGML_UNUSED(n_bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -2086,6 +2086,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
return false;
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement F32, F16, BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return false;
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
@@ -2182,7 +2189,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_CLAMP:
|
||||
@@ -2204,6 +2210,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_COUNT_EQUAL:
|
||||
return true;
|
||||
case GGML_OP_SCALE:
|
||||
float bias;
|
||||
memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
|
||||
return bias == 0.0f; // TODO: support bias != 0.0f
|
||||
case GGML_OP_SOFT_MAX:
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
|
||||
@@ -2172,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
|
||||
@@ -3614,6 +3614,292 @@ static void ggml_compute_forward_swiglu(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_geglu_erf
|
||||
|
||||
static void ggml_compute_forward_geglu_erf_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
float * src0_p = (float *) (src0_d + i1*src0_o);
|
||||
float * src1_p = (float *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_erf_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
GGML_UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_erf_f16(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
|
||||
ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_erf_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float v = GGML_FP16_TO_FP32(x);
|
||||
GGML_UNUSED(v);
|
||||
assert(!isnan(v));
|
||||
assert(!isinf(v));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_erf(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_geglu_erf_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_geglu_erf_f16(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_geglu_quick
|
||||
|
||||
static void ggml_compute_forward_geglu_quick_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
float * src0_p = (float *) (src0_d + i1*src0_o);
|
||||
float * src1_p = (float *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_quick_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
GGML_UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_quick_f16(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
char * src0_d = (char *) src0->data;
|
||||
char * src1_d = (char *) (src1 ? src1->data : src0->data);
|
||||
const size_t src0_o = src0->nb[1];
|
||||
const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
||||
|
||||
if (src1) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
||||
GGML_ASSERT(src0->type == src1->type);
|
||||
}
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_ASSERT(dst->ne[0] == nc);
|
||||
GGML_ASSERT(ggml_nrows(dst) == nr);
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o);
|
||||
ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o);
|
||||
|
||||
if (!src1) {
|
||||
src0_p += swapped ? nc : 0;
|
||||
src1_p += swapped ? 0 : nc;
|
||||
}
|
||||
|
||||
ggml_vec_geglu_quick_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float v = GGML_FP16_TO_FP32(x);
|
||||
GGML_UNUSED(v);
|
||||
assert(!isnan(v));
|
||||
assert(!isinf(v));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_geglu_quick(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_geglu_quick_f32(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_geglu_quick_f16(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_norm
|
||||
|
||||
static void ggml_compute_forward_norm_f32(
|
||||
@@ -4357,9 +4643,11 @@ static void ggml_compute_forward_scale_f32(
|
||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
// scale factor
|
||||
float v;
|
||||
memcpy(&v, dst->op_params, sizeof(float));
|
||||
float s; // scale factor
|
||||
float b; // bias
|
||||
|
||||
memcpy(&s, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&b, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
@@ -4378,12 +4666,22 @@ static void ggml_compute_forward_scale_f32(
|
||||
|
||||
const size_t nb1 = dst->nb[1];
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
if (dst->data != src0->data) {
|
||||
// src0 is same shape as dst => same indices
|
||||
memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
|
||||
if (b == 0.0f) {
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
if (dst->data != src0->data) {
|
||||
// src0 is same shape as dst => same indices
|
||||
// TODO: add x parameter to ggml_vec_scale_f32 and remove this memcpy
|
||||
memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float));
|
||||
}
|
||||
ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), s);
|
||||
}
|
||||
} else {
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
ggml_vec_mad1_f32(nc,
|
||||
(float *) ((char *) dst->data + i1*nb1),
|
||||
(float *) ((char *) src0->data + i1*nb1),
|
||||
s, b);
|
||||
}
|
||||
ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7799,7 +8097,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
memset(VKQ32, 0, DV*sizeof(float));
|
||||
}
|
||||
|
||||
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq3%mask->ne[2])*mask->nb[2]) : NULL;
|
||||
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]) : NULL;
|
||||
|
||||
// k indices
|
||||
const int ik3 = iq3 / rk3;
|
||||
@@ -8779,6 +9077,14 @@ void ggml_compute_forward_glu(
|
||||
{
|
||||
ggml_compute_forward_swiglu(params, dst);
|
||||
} break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
{
|
||||
ggml_compute_forward_geglu_erf(params, dst);
|
||||
} break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
{
|
||||
ggml_compute_forward_geglu_quick(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
||||
@@ -351,6 +351,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
|
||||
#elif defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// scalar ; TODO: Write SVE code
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = x[i]*s + b;
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F32_STEP - 1));
|
||||
|
||||
GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
|
||||
GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);
|
||||
|
||||
GGML_F32_VEC ay[GGML_F32_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
||||
for (int j = 0; j < GGML_F32_ARR; j++) {
|
||||
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
||||
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = x[i]*s + b;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = x[i]*s + b;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
|
||||
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
@@ -959,6 +998,46 @@ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float xi = x[i];
|
||||
y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float xi = GGML_CPU_FP16_TO_FP32(x[i]);
|
||||
float gi = GGML_CPU_FP16_TO_FP32(g[i]);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_GELU_QUICK_FP16
|
||||
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
|
||||
uint16_t t;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
|
||||
memcpy(&t, &fp16, sizeof(uint16_t));
|
||||
y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
|
||||
}
|
||||
}
|
||||
#else
|
||||
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
|
||||
const uint16_t * i16 = (const uint16_t *) x;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float v = GGML_CPU_FP16_TO_FP32(g[i]);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
|
||||
}
|
||||
}
|
||||
|
||||
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
|
||||
#ifndef GGML_USE_ACCELERATE
|
||||
ggml_float sum = 0.0;
|
||||
|
||||
@@ -175,6 +175,23 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
||||
#endif
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
||||
do { \
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = { false }; \
|
||||
const int id = ggml_cuda_get_device(); \
|
||||
if (!shared_memory_limit_raised[id]) { \
|
||||
CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes)); \
|
||||
shared_memory_limit_raised[id] = true; \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
# define CUDA_SET_SHARED_MEMORY_LIMIT(kernel, nbytes) \
|
||||
do { \
|
||||
GGML_UNUSED(nbytes); \
|
||||
} while (0)
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||
#else
|
||||
|
||||
@@ -123,13 +123,7 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_f32<true>), smpbo);
|
||||
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
} else {
|
||||
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||
@@ -175,13 +169,7 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_ten
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((cross_entropy_loss_back_f32<true>), smpbo);
|
||||
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
} else {
|
||||
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
|
||||
|
||||
@@ -299,14 +299,14 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
|
||||
GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
|
||||
GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
|
||||
GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
|
||||
GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
|
||||
GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -337,13 +337,15 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
|
||||
GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
|
||||
GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
|
||||
GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
|
||||
GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
|
||||
GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
|
||||
GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
|
||||
GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
|
||||
GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
|
||||
GGML_UNUSED(ne31); GGML_UNUSED(ne32);
|
||||
GGML_UNUSED(nb31); GGML_UNUSED(nb32);
|
||||
GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
|
||||
GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
|
||||
GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
|
||||
GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FLASH_ATTN_AVAILABLE
|
||||
}
|
||||
|
||||
@@ -168,6 +168,10 @@ static void ggml_cuda_get_rows_switch_src0_type(
|
||||
get_rows_cuda_float((const float *) src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_I32:
|
||||
get_rows_cuda_float((const int32_t *) src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_BF16:
|
||||
get_rows_cuda_float((const nv_bfloat16 *) src0_d, src1_d, dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
@@ -210,6 +214,10 @@ void get_rows_cuda(
|
||||
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (float *) dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_I32:
|
||||
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (int32_t *) dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
ggml_cuda_get_rows_switch_src0_type(src0_d, src0_type, src1_d, (half *) dst_d,
|
||||
ne00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb1, nb2, nb3, stream);
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "ggml-cuda/upscale.cuh"
|
||||
#include "ggml-cuda/wkv.cuh"
|
||||
#include "ggml-cuda/gla.cuh"
|
||||
#include "ggml-cuda/set-rows.cuh"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -2230,6 +2231,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_GET_ROWS_BACK:
|
||||
ggml_cuda_op_get_rows_back(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
ggml_cuda_op_set_rows(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DUP:
|
||||
ggml_cuda_dup(ctx, dst);
|
||||
break;
|
||||
@@ -2299,6 +2303,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_EXP:
|
||||
ggml_cuda_op_exp(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_ELU:
|
||||
ggml_cuda_op_elu(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2314,6 +2321,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
ggml_cuda_op_swiglu(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
ggml_cuda_op_geglu_erf(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
ggml_cuda_op_geglu_quick(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -3106,6 +3119,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
@@ -3116,6 +3130,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
@@ -3192,6 +3208,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
@@ -3206,6 +3224,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
{
|
||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
||||
} break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
|
||||
op->src[0]->type == GGML_TYPE_F32 &&
|
||||
op->src[1]->type == GGML_TYPE_I64;
|
||||
} break;
|
||||
case GGML_OP_CPY:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
@@ -3325,8 +3350,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_SSM_SCAN: {
|
||||
if (op->src[3]->ne[0] == 1) {
|
||||
// Mamba2
|
||||
// (kernel only supports d_state == 128 && d_head % 16 == 0)
|
||||
return op->src[0]->ne[0] == 128 && op->src[0]->ne[1] % 16 == 0;
|
||||
// (kernel only supports (d_state == 128 || d_state == 256) && d_head % 16 == 0)
|
||||
return (op->src[0]->ne[0] == 128 || op->src[0]->ne[0] == 256) && op->src[0]->ne[1] % 16 == 0;
|
||||
} else {
|
||||
// Mamba
|
||||
// (kernel only supports d_state == 16, d_head == 1, n_head % 128 == 0, n_group == 1)
|
||||
@@ -3365,7 +3390,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_UPSCALE:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
@@ -3390,7 +3414,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
return false;
|
||||
}
|
||||
// TODO: support broadcast
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
|
||||
// note: this was initially implemented in https://github.com/ggml-org/llama.cpp/pull/14500, but
|
||||
// the interface of ggml_flash_attn_ext() changed in https://github.com/ggml-org/llama.cpp/pull/14505
|
||||
if (op->src[0]->ne[3] != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3016,14 +3016,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc);
|
||||
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shared_memory_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared));
|
||||
shared_memory_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, false>), nbytes_shared);
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, true>), nbytes_shared);
|
||||
|
||||
const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
|
||||
|
||||
@@ -50,21 +50,19 @@ static __global__ void rope_norm(
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row_dst*ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
|
||||
const int idst = row_dst*ne0 + i0;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
dst[idst + 0] = x[ix + 0];
|
||||
dst[idst + 1] = x[ix + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
@@ -94,21 +92,19 @@ static __global__ void rope_neox(
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row_dst*ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
|
||||
const int idst = row_dst*ne0 + i0/2;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0/2;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
|
||||
dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
|
||||
|
||||
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
|
||||
@@ -138,21 +134,19 @@ static __global__ void rope_multi(
|
||||
|
||||
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
const int i = row_dst*ne0 + i0;
|
||||
|
||||
dst[i + 0] = x[i + 0];
|
||||
dst[i + 1] = x[i + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int row_x = row_dst % ne1;
|
||||
const int channel_x = row_dst / ne1;
|
||||
|
||||
const int idst = row_dst*ne0 + i0/2;
|
||||
const int ix = channel_x*s2 + row_x*s1 + i0/2;
|
||||
|
||||
if (i0 >= n_dims) {
|
||||
dst[idst + i0/2 + 0] = x[ix + i0/2 + 0];
|
||||
dst[idst + i0/2 + 1] = x[ix + i0/2 + 1];
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
|
||||
const int sec_w = sections.v[1] + sections.v[0];
|
||||
const int sector = (i0 / 2) % sect_dims;
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
#include "scale.cuh"
|
||||
|
||||
static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
|
||||
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[i] = scale * x[i];
|
||||
dst[i] = scale * x[i] + bias;
|
||||
}
|
||||
|
||||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
|
||||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
|
||||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
||||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -25,7 +25,9 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
float scale;
|
||||
memcpy(&scale, dst->op_params, sizeof(float));
|
||||
float bias;
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
memcpy(&bias, (float *) dst->op_params + 1, sizeof(float));
|
||||
|
||||
scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
|
||||
scale_f32_cuda(src0_d, dst_d, scale, bias, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
145
ggml/src/ggml-cuda/set-rows.cu
Normal file
145
ggml/src/ggml-cuda/set-rows.cu
Normal file
@@ -0,0 +1,145 @@
|
||||
#include "set-rows.cuh"
|
||||
|
||||
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
|
||||
*dst_h = __float2half(*src_f);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
|
||||
*dst_b = *src_f;
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
|
||||
*dst_f = *src_f;
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static __global__ void k_set_rows(
|
||||
const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t s10, const int64_t s11, const int64_t s12,
|
||||
const int64_t s1, const int64_t s2, const int64_t s3) {
|
||||
|
||||
const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
|
||||
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
|
||||
|
||||
if (i >= ne_total) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i03 = i / (ne00 * ne01 * ne02);
|
||||
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
|
||||
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
|
||||
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
|
||||
|
||||
const int64_t i12 = i03 % ne12;
|
||||
const int64_t i11 = i02 % ne11;
|
||||
const int64_t i10 = i01;
|
||||
|
||||
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
|
||||
|
||||
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
|
||||
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
|
||||
|
||||
const src_t* src_elem = src0_row + i00;
|
||||
dst_t* dst_elem = dst_row_ptr + i00;
|
||||
set_rows_1(src_elem, dst_elem);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
static void set_rows_cuda(
|
||||
const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
||||
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
|
||||
const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t nb10, const size_t nb11, const size_t nb12,
|
||||
const size_t nb1, const size_t nb2, const size_t nb3,
|
||||
cudaStream_t stream) {
|
||||
|
||||
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
|
||||
const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
|
||||
const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
|
||||
const dim3 grid_size(num_blocks);
|
||||
|
||||
|
||||
const int64_t s01 = nb01/sizeof(src_t);
|
||||
const int64_t s02 = nb02/sizeof(src_t);
|
||||
const int64_t s03 = nb03/sizeof(src_t);
|
||||
const int64_t s10 = nb10/sizeof(int64_t);
|
||||
const int64_t s11 = nb11/sizeof(int64_t);
|
||||
const int64_t s12 = nb12/sizeof(int64_t);
|
||||
const int64_t s1 = nb1/sizeof(dst_t);
|
||||
const int64_t s2 = nb2/sizeof(dst_t);
|
||||
const int64_t s3 = nb3/sizeof(dst_t);
|
||||
|
||||
if (ne_total > 0) {
|
||||
k_set_rows<<<grid_size, block_size, 0, stream>>>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
s01, s02, s03,
|
||||
s10, s11, s12,
|
||||
s1, s2, s3);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_I64);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const int64_t * src1_d = (const int64_t *)src1->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
set_rows_cuda(
|
||||
src0_d, src1_d, (float*)dst->data,
|
||||
ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
stream
|
||||
);
|
||||
} else if (dst->type == GGML_TYPE_F16) {
|
||||
set_rows_cuda(
|
||||
src0_d, src1_d, (half*)dst->data,
|
||||
ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
stream
|
||||
);
|
||||
} else if (dst->type == GGML_TYPE_BF16) {
|
||||
set_rows_cuda(
|
||||
src0_d, src1_d, (nv_bfloat16*)dst->data,
|
||||
ne00, ne01, ne02, ne03,
|
||||
ne10, ne11, ne12, ne13,
|
||||
nb01, nb02, nb03,
|
||||
nb10, nb11, nb12,
|
||||
nb1, nb2, nb3,
|
||||
stream
|
||||
);
|
||||
} else {
|
||||
GGML_ABORT("unsupported type");
|
||||
}
|
||||
}
|
||||
7
ggml/src/ggml-cuda/set-rows.cuh
Normal file
7
ggml/src/ggml-cuda/set-rows.cuh
Normal file
@@ -0,0 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
#define CUDA_SET_ROWS_BLOCK_SIZE 256
|
||||
|
||||
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "ggml.h"
|
||||
#include "softmax.cuh"
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
|
||||
template <typename T>
|
||||
static __device__ __forceinline__ float t2f32(T val) {
|
||||
@@ -181,6 +182,37 @@ static __global__ void soft_max_back_f32(
|
||||
}
|
||||
}
|
||||
|
||||
template<int... Ns, typename T>
|
||||
static void launch_soft_max_kernels(const float * x, const T * mask, float * dst,
|
||||
const soft_max_params & p, cudaStream_t stream, dim3 block_dims, dim3 block_nums, size_t nbytes_shared)
|
||||
{
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
auto launch_kernel = [=](auto I) -> bool {
|
||||
constexpr int ncols = decltype(I)::value;
|
||||
constexpr int block = (ncols > 1024 ? 1024 : ncols);
|
||||
|
||||
if (p.ncols == ncols) {
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, ncols, block, T>), smpbo);
|
||||
soft_max_f32<true, ncols, block><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, p);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// unary fold over launch_kernel
|
||||
if ((launch_kernel(std::integral_constant<int, Ns>{}) || ...)) {
|
||||
return;
|
||||
}
|
||||
|
||||
//default case
|
||||
CUDA_SET_SHARED_MEMORY_LIMIT((soft_max_f32<true, 0, 0, T>), smpbo);
|
||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, dst, p);
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const soft_max_params & params, cudaStream_t stream) {
|
||||
int nth = WARP_SIZE;
|
||||
@@ -193,46 +225,12 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
|
||||
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
|
||||
|
||||
|
||||
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
|
||||
if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
|
||||
switch (ncols_x) {
|
||||
case 32:
|
||||
soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 64:
|
||||
soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 128:
|
||||
soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 256:
|
||||
soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 512:
|
||||
soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 1024:
|
||||
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 2048:
|
||||
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
case 4096:
|
||||
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
default:
|
||||
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
|
||||
(x, mask, dst, params);
|
||||
break;
|
||||
}
|
||||
const int id = ggml_cuda_get_device();
|
||||
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
|
||||
|
||||
|
||||
if (nbytes_shared <= smpbo) {
|
||||
launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, dst, params, stream, block_dims, block_nums, nbytes_shared);
|
||||
} else {
|
||||
const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
|
||||
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, params);
|
||||
|
||||
@@ -107,8 +107,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
|
||||
if (nc == 4) {
|
||||
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
} else if (nc == 3) {
|
||||
ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
} else {
|
||||
GGML_ABORT("Only support kernel size = 4 now.");
|
||||
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
|
||||
}
|
||||
} else {
|
||||
if (nc == 4) {
|
||||
@@ -116,8 +119,13 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
|
||||
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
|
||||
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
|
||||
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
} else if (nc == 3) {
|
||||
const int64_t split_n_t = 32;
|
||||
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
|
||||
ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
|
||||
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
} else {
|
||||
GGML_ABORT("Only support kernel size = 4 right now.");
|
||||
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,11 +201,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
|
||||
const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
|
||||
cudaStream_t stream) {
|
||||
const int threads = 128;
|
||||
// NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
|
||||
if (src3_nb1 == sizeof(float)) {
|
||||
// Mamba-2
|
||||
if (d_state == 128) {
|
||||
const int threads = 128;
|
||||
GGML_ASSERT(d_state % threads == 0);
|
||||
// NOTE: can be any power of two between 4 and 64
|
||||
const int splitH = 16;
|
||||
@@ -215,10 +215,21 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
} else if (d_state == 256) { // Falcon-H1
|
||||
const int threads = 256;
|
||||
// NOTE: can be any power of two between 8 and 64
|
||||
const int splitH = 16;
|
||||
GGML_ASSERT(head_dim % splitH == 0);
|
||||
const dim3 blocks((n_head * head_dim + (splitH - 1)) / splitH, n_seq, 1);
|
||||
ssm_scan_f32_group<16, 256><<<blocks, threads, 0, stream>>>(
|
||||
src0, src1, src2, src3, src4, src5, src6, dst,
|
||||
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
|
||||
src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
|
||||
} else {
|
||||
GGML_ABORT("doesn't support d_state!=128.");
|
||||
GGML_ABORT("doesn't support d_state!=(128 or 256).");
|
||||
}
|
||||
} else {
|
||||
const int threads = 128;
|
||||
// Mamba-1
|
||||
GGML_ASSERT(n_head % threads == 0);
|
||||
GGML_ASSERT(head_dim == 1);
|
||||
|
||||
@@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) {
|
||||
return logf(x);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_elu(float x) {
|
||||
return (x > 0.f) ? x : expm1f(x);
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
@@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_log>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary<op_elu>(ctx, dst);
|
||||
}
|
||||
/* gated ops */
|
||||
|
||||
template <float (*op)(float), typename T>
|
||||
@@ -285,6 +292,14 @@ void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary_gated<op_silu>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary_gated<op_gelu_erf>(ctx, dst);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cuda_op_unary_gated<op_gelu_quick>(ctx, dst);
|
||||
}
|
||||
|
||||
/* silu_back */
|
||||
|
||||
static __device__ __forceinline__ float op_silu_back(float grad, float x) {
|
||||
|
||||
@@ -59,8 +59,14 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
@@ -22,17 +22,88 @@ static __global__ void upscale_f32(const float * x, float * dst,
|
||||
dst[index] = *( (const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00) );
|
||||
}
|
||||
|
||||
static __global__ void upscale_f32_bilinear(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne00_src, const int ne01_src,
|
||||
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
const float pixel_offset) {
|
||||
const int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
|
||||
if (index >= dst_total_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i10_dst = index % ne10_dst;
|
||||
const int i11_dst = (index / ne10_dst) % ne11_dst;
|
||||
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
|
||||
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
|
||||
|
||||
const int i02_src = (int)(i12_dst / sf2);
|
||||
const int i03_src = (int)(i13_dst / sf3);
|
||||
|
||||
const float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
|
||||
int y0_src = (int)floorf(y_src_f);
|
||||
int y1_src = y0_src + 1;
|
||||
|
||||
y0_src = max(0, min(y0_src, ne01_src - 1));
|
||||
y1_src = max(0, min(y1_src, ne01_src - 1));
|
||||
|
||||
float dy = y_src_f - (float)y0_src;
|
||||
dy = max(0.0f, min(dy, 1.0f));
|
||||
|
||||
float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset;
|
||||
int x0_src = (int)floorf(x_src_f);
|
||||
int x1_src = x0_src + 1;
|
||||
|
||||
x0_src = max(0, min(x0_src, ne00_src - 1));
|
||||
x1_src = max(0, min(x1_src, ne00_src - 1));
|
||||
|
||||
float dx = x_src_f - (float)x0_src;
|
||||
dx = max(0.0f, min(dx, 1.0f));
|
||||
|
||||
const float * p_a = (const float *)((const char *)x + (int64_t)x0_src * nb00 + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
|
||||
const float * p_b = (const float *)((const char *)x + (int64_t)x1_src * nb00 + (int64_t)y0_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
|
||||
const float * p_c = (const float *)((const char *)x + (int64_t)x0_src * nb00 + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
|
||||
const float * p_d = (const float *)((const char *)x + (int64_t)x1_src * nb00 + (int64_t)y1_src * nb01 + (int64_t)i02_src * nb02 + (int64_t)i03_src * nb03);
|
||||
|
||||
const float val_a = *p_a;
|
||||
const float val_b = *p_b;
|
||||
const float val_c = *p_c;
|
||||
const float val_d = *p_d;
|
||||
|
||||
float result = val_a * (1.0f - dx) * (1.0f - dy) +
|
||||
val_b * dx * (1.0f - dy) +
|
||||
val_c * (1.0f - dx) * dy +
|
||||
val_d * dx * dy;
|
||||
|
||||
dst[index] = result;
|
||||
}
|
||||
|
||||
static void upscale_f32_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
cudaStream_t stream) {
|
||||
int dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
const int64_t dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
|
||||
}
|
||||
|
||||
static void upscale_f32_bilinear_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne00_src, const int ne01_src,
|
||||
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
const float pixel_offset, cudaStream_t stream) {
|
||||
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
|
||||
const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
@@ -42,10 +113,25 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
||||
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
||||
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const int mode_flags = dst->op_params[0];
|
||||
const ggml_scale_mode mode = (ggml_scale_mode)(mode_flags & 0xFF);
|
||||
|
||||
float sf0 = (float)dst->ne[0]/src0->ne[0];
|
||||
float sf1 = (float)dst->ne[1]/src0->ne[1];
|
||||
float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
||||
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
if (mode == GGML_SCALE_MODE_NEAREST) {
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
||||
float pixel_offset = 0.5f;
|
||||
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
||||
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
|
||||
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
|
||||
pixel_offset = 0.0f;
|
||||
}
|
||||
upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
|
||||
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
sf0, sf1, sf2, sf3, pixel_offset, stream);
|
||||
}
|
||||
}
|
||||
|
||||
19
ggml/src/ggml-cuda/vendors/hip.h
vendored
19
ggml/src/ggml-cuda/vendors/hip.h
vendored
@@ -10,9 +10,6 @@
|
||||
#include "rocblas/rocblas.h"
|
||||
#endif // __HIP_PLATFORM_AMD__
|
||||
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||
@@ -30,7 +27,6 @@
|
||||
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
|
||||
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasDestroy hipblasDestroy
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
@@ -42,7 +38,6 @@
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cublasOperation_t hipblasOperation_t
|
||||
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
|
||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||
@@ -144,6 +139,20 @@
|
||||
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
|
||||
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
|
||||
#define cublasComputeType_t hipblasComputeType_t
|
||||
#define cudaDataType_t hipDataType
|
||||
#else
|
||||
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define cublasComputeType_t hipblasDatatype_t
|
||||
#define cudaDataType_t hipblasDatatype_t
|
||||
#endif
|
||||
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-kompute
|
||||
ggml-kompute.cpp
|
||||
../../include/ggml-kompute.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
|
||||
target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q4_k.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f32.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_norm_f16.comp
|
||||
kompute-shaders/op_rope_norm_f32.comp
|
||||
kompute-shaders/op_rope_neox_f16.comp
|
||||
kompute-shaders/op_rope_neox_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q4_k.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f32.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_norm_f16.h
|
||||
shaderop_rope_norm_f32.h
|
||||
shaderop_rope_neox_f16.h
|
||||
shaderop_rope_neox_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
File diff suppressed because it is too large
Load Diff
Submodule ggml/src/ggml-kompute/kompute deleted from 4565194ed7
@@ -1,112 +0,0 @@
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
|
||||
#extension GL_EXT_control_flow_attributes: enable
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_1 32
|
||||
|
||||
#define GELU_COEF_A 0.044715
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
||||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
|
||||
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
|
||||
|
||||
#define sizeof_block_q4_0 0x12
|
||||
struct block_q4_0 {
|
||||
float16_t d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.f * xb.d;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_1 0x14
|
||||
struct block_q4_1 {
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb.m;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_k 144
|
||||
struct block_q4_k {
|
||||
float16_t d;
|
||||
float16_t dmin;
|
||||
uint8_t scales[K_SCALE_SIZE];
|
||||
uint8_t qs[QK_K/2];
|
||||
};
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
float16_t d; // super-block scale
|
||||
};
|
||||
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
|
||||
const float16_t d_all = xb.d;
|
||||
|
||||
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
const uint qhIndex = 32*(il/8) + 16*(il&1);
|
||||
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
|
||||
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
|
||||
const float16_t ml = float16_t(d_all * sc * 32.f);
|
||||
const float16_t dl = float16_t(d_all * sc * coef);
|
||||
mat4 reg;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
|
||||
: ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
|
||||
#define QK8_0 32
|
||||
// struct block_q8_0 {
|
||||
// float16_t d; // delta
|
||||
// int8_t qs[QK8_0]; // quants
|
||||
// };
|
||||
#define sizeof_block_q8_0 34
|
||||
@@ -1,58 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
//int offs; // TODO: needed for GGML_OP_ACC, see metal code
|
||||
} pcs;
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
int offs = 0; // TMP (see above)
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
uint row;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint n_past;
|
||||
int ne00;
|
||||
int ne01;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i02 = gl_WorkGroupID.z;
|
||||
const uint i01 = gl_WorkGroupID.y;
|
||||
const uint i00 = gl_WorkGroupID.x;
|
||||
|
||||
const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
|
||||
|
||||
if (i00 > pcs.n_past + i01) {
|
||||
out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
|
||||
} else {
|
||||
out_[index + pcs.outOff] = in_[index + pcs.inOff];
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
|
||||
}
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
int z = 0;
|
||||
for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
|
||||
const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
|
||||
const mat4 result = dequantize_block(inIndex, ind%NL);
|
||||
for (uint j = 0; j < 4; ++j) {
|
||||
for (uint k = 0; k < 4; ++k) {
|
||||
const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
|
||||
out_[outIndex] = result[j][k];
|
||||
++z;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
||||
block_q4_0 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
||||
fres.qs[it] = inA[index+2+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_0 block = get_unaligned_block_q4_0(index);
|
||||
return dequantize_q4_0(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
||||
@@ -1,39 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||
block_q4_1 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
fres.m = u8BufToFloat16(inA, index+2);
|
||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||
fres.qs[it] = inA[index+4+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(index);
|
||||
return dequantize_q4_1(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
||||
@@ -1,44 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 16
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q6_k get_unaligned_block_q6_k(uint index) {
|
||||
block_q6_k fres;
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
|
||||
fres.ql[it] = inA[index + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
|
||||
fres.qh[it] = inA[index + QK_K/2 + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
|
||||
fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
|
||||
}
|
||||
fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q6_k block = get_unaligned_block_q6_k(index);
|
||||
return dequantize_q6_k(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
||||
@@ -1,69 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb10;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
void main() {
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint rb = gl_WorkGroupID.y*N_F16_F32;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
|
||||
|
||||
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
|
||||
|
||||
for (uint row = 0; row < N_F16_F32; ++row) {
|
||||
uint r1 = rb + row;
|
||||
if (r1 >= pcs.ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf = 0;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,51 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
// device subgroup size
|
||||
layout (local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout(binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
}
|
||||
pcs;
|
||||
|
||||
|
||||
void main() {
|
||||
uvec3 gid = gl_WorkGroupID;
|
||||
|
||||
uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
|
||||
uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
|
||||
|
||||
const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
|
||||
const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||
float sum = 0.0f;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sum += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sum);
|
||||
if (subgroupElect()) {
|
||||
out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_0
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_0 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
||||
@@ -1,35 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_1
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_1 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float m = float(u8BufToFloat16(inA, index+2));
|
||||
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
||||
@@ -1,140 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define N_DST 4
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_k
|
||||
|
||||
layout(local_size_x = 4) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint16_t kmask1 = uint16_t(0x3f3f);
|
||||
const uint16_t kmask2 = uint16_t(0x0f0f);
|
||||
const uint16_t kmask3 = uint16_t(0xc0c0);
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/8; // 0...3
|
||||
const uint it = gl_SubgroupInvocationID%8; // 0...7
|
||||
const uint iq = it/4; // 0 or 1
|
||||
const uint ir = it%4; // 0...3
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = r0 * N_DST;
|
||||
const uint ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
|
||||
|
||||
const uint xblk = offset0 + pcs.inAOff;
|
||||
const uint y = (offset1 / 4) + pcs.inBOff;
|
||||
|
||||
float yl[16];
|
||||
float yh[16];
|
||||
float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
|
||||
float all_sum = 0.f;
|
||||
|
||||
uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 4) {
|
||||
const uint blk_idx = ib + xblk;
|
||||
|
||||
float sumy[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
|
||||
yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
|
||||
yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
|
||||
yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
|
||||
|
||||
uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
|
||||
uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
|
||||
uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
|
||||
uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
|
||||
uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
|
||||
|
||||
uint16_t sc16[4];
|
||||
sc16[0] = sc_0 & kmask1;
|
||||
sc16[1] = sc_2 & kmask1;
|
||||
sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
|
||||
sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
|
||||
|
||||
float acc1[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
float acc2[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
|
||||
uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
|
||||
acc1[0] += yl[i+0] * (q1 & 0x000F);
|
||||
acc1[1] += yl[i+1] * (q1 & 0x0F00);
|
||||
acc1[2] += yl[i+8] * (q1 & 0x00F0);
|
||||
acc1[3] += yl[i+9] * (q1 & 0xF000);
|
||||
acc2[0] += yh[i+0] * (q2 & 0x000F);
|
||||
acc2[1] += yh[i+1] * (q2 & 0x0F00);
|
||||
acc2[2] += yh[i+8] * (q2 & 0x00F0);
|
||||
acc2[3] += yh[i+9] * (q2 & 0xF000);
|
||||
}
|
||||
|
||||
uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
|
||||
uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
|
||||
uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
|
||||
uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
|
||||
uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
|
||||
uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
|
||||
uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
|
||||
uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
|
||||
|
||||
float dall = float(inA[blk_idx + row_idx].d);
|
||||
float dmin = float(inA[blk_idx + row_idx].dmin);
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
|
||||
(acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
|
||||
(acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
|
||||
dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
|
||||
}
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,106 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y_id = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint8_t kmask1 = uint8_t(0x03);
|
||||
const uint8_t kmask2 = uint8_t(0x0C);
|
||||
const uint8_t kmask3 = uint8_t(0x30);
|
||||
const uint8_t kmask4 = uint8_t(0xC0);
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// bits of invocation ID for gl_SubgroupSize=32:
|
||||
// x x x x x
|
||||
// 4 3 2 1 0
|
||||
// ( tid ) ix
|
||||
// ip ( il )
|
||||
|
||||
const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
|
||||
const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
|
||||
const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
|
||||
const uint ip = tid/8; // first or second half of block (0 or 1)
|
||||
const uint il = tid%8; // each half has 8 parts, one per scale
|
||||
const uint n = 4; // 4 scales at a time (and 4 sums)
|
||||
const uint l0 = n*il; // offset into half-block, 0..28
|
||||
const uint is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
const uint y_offset = 128*ip + l0;
|
||||
const uint q_offset_l = 64*ip + l0;
|
||||
const uint q_offset_h = 32*ip + l0;
|
||||
|
||||
for (uint i = ix; i < nb; i += block_stride) {
|
||||
|
||||
const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
|
||||
const uint qlIndex = q_offset_l;
|
||||
const uint q2Index = qlIndex + QK_K/8;
|
||||
const uint qhIndex = q_offset_h;
|
||||
const uint y = yy + i * QK_K + y_offset;
|
||||
|
||||
float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
for (uint l = 0; l < n; ++l) {
|
||||
const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
|
||||
const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
|
||||
const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
|
||||
|
||||
sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
|
||||
sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
|
||||
sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
|
||||
sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
|
||||
}
|
||||
|
||||
float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
|
||||
}
|
||||
|
||||
const float tot = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
||||
@@ -1,73 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
#define SIZE_OF_D 2
|
||||
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const int nr = N_DST;
|
||||
const int nsg = N_SIMDGROUP;
|
||||
const int nw = N_SIMDWIDTH;
|
||||
|
||||
const int nb = pcs.ne00/QK8_0;
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
|
||||
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
|
||||
|
||||
float yl[NB_Q8_0];
|
||||
float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID.x/4;
|
||||
const uint il = gl_SubgroupInvocationID.x%4;
|
||||
|
||||
uint yb = y + ix * QK8_0 + NB_Q8_0*il;
|
||||
|
||||
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
|
||||
for (uint ib = ix; ib < nb; ib += nw/4) {
|
||||
for (int i = 0; i < NB_Q8_0; ++i) {
|
||||
yl[i] = inB[yb + i];
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; row++) {
|
||||
const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
|
||||
float sumq = 0.f;
|
||||
for (int iq = 0; iq < NB_Q8_0; ++iq) {
|
||||
const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
|
||||
sumq += qs_iq * yl[iq];
|
||||
}
|
||||
const float16_t d = u8BufToFloat16(inA, x + block_offset);
|
||||
sumf[row] += sumq*d;
|
||||
}
|
||||
|
||||
yb += NB_Q8_0 * nw;
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect() && first_row + row < pcs.ne01) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
// pointers to src0 rows
|
||||
uint ax[N_ROWS];
|
||||
for (int row = 0; row < N_ROWS; ++row) {
|
||||
const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
|
||||
ax[row] = offset0 + pcs.inAOff;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/2;
|
||||
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
|
||||
|
||||
uint yb = y + ix * BLOCKS_IN_QUANT + il;
|
||||
|
||||
//debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
|
||||
// gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
|
||||
// gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 16) {
|
||||
for (int row = 0; row < N_ROWS; row++) {
|
||||
sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
|
||||
}
|
||||
|
||||
yb += BLOCKS_IN_QUANT * 16;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_ROWS; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (first_row + row < pcs.ne01 && subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,28 +0,0 @@
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne10;
|
||||
int ne12;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
@@ -1,84 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 256) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
// MEAN
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float mean = sum[0];
|
||||
|
||||
// recenter
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] - mean;
|
||||
}
|
||||
|
||||
// VARIANCE
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float variance = sum[0];
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + pcs.eps);
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] *= scale;
|
||||
}
|
||||
}
|
||||
@@ -1,21 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
|
||||
}
|
||||
}
|
||||
@@ -1,53 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 512) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
|
||||
const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
|
||||
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] * scale;
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+pcs.n_dims/2]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+pcs.n_dims/2];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+1]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
|
||||
layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
float theta_base = float(inB[pcs.inBOff + i2]);
|
||||
float inv_ndims = -1.f/pcs.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
|
||||
if (i0 < pcs.n_dims) {
|
||||
uint ic = i0/2;
|
||||
|
||||
float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+1];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data] = inA[src];
|
||||
out_[dst_data+1] = inA[src+1];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,19 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = y / (1.0 + exp(-y));
|
||||
}
|
||||
}
|
||||
@@ -1,72 +0,0 @@
|
||||
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
|
||||
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
float scale;
|
||||
float max_bias;
|
||||
float m0;
|
||||
float m1;
|
||||
uint n_head_log2;
|
||||
int mask;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
|
||||
const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
|
||||
const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
|
||||
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (pcs.max_bias > 0.0f) {
|
||||
int64_t h = i02;
|
||||
|
||||
float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
|
||||
int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, float(exp));
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float localMax = uintBitsToFloat(0xFF800000);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
|
||||
}
|
||||
float max_ = subgroupMax(localMax);
|
||||
|
||||
// parallel sum
|
||||
float localSum = 0.0f;
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
|
||||
localSum += exp_psrc0;
|
||||
out_[pdst + i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
const float sum = subgroupAdd(localSum);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
out_[pdst + i00] /= sum;
|
||||
}
|
||||
}
|
||||
@@ -1,71 +0,0 @@
|
||||
#include "common.comp"
|
||||
|
||||
#define GGML_ROPE_TYPE_NEOX 2
|
||||
|
||||
// TODO: use a local size of 32 or more (Metal uses 1024)
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint inCOff;
|
||||
uint outOff;
|
||||
int n_dims;
|
||||
int mode;
|
||||
int n_ctx_orig;
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
bool has_freq_factors;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const float i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
|
||||
out float cos_theta, out float sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
cos_theta = cos(theta) * mscale;
|
||||
sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
||||
return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
|
||||
}
|
||||
|
||||
void rope_yarn_corr_dims(
|
||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
|
||||
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
|
||||
}
|
||||
@@ -230,8 +230,10 @@ typedef struct {
|
||||
uint64_t nb22;
|
||||
uint64_t nb23;
|
||||
int32_t ne32;
|
||||
int32_t ne33;
|
||||
uint64_t nb31;
|
||||
uint64_t nb32;
|
||||
uint64_t nb33;
|
||||
int32_t ne1;
|
||||
int32_t ne2;
|
||||
float scale;
|
||||
|
||||
@@ -173,6 +173,12 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_SILU,
|
||||
GGML_METAL_KERNEL_TYPE_SILU_4,
|
||||
GGML_METAL_KERNEL_TYPE_ELU,
|
||||
GGML_METAL_KERNEL_TYPE_ABS,
|
||||
GGML_METAL_KERNEL_TYPE_SGN,
|
||||
GGML_METAL_KERNEL_TYPE_STEP,
|
||||
GGML_METAL_KERNEL_TYPE_HARDSWISH,
|
||||
GGML_METAL_KERNEL_TYPE_HARDSIGMOID,
|
||||
GGML_METAL_KERNEL_TYPE_EXP,
|
||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
|
||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
|
||||
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
|
||||
@@ -530,6 +536,8 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_REGLU,
|
||||
GGML_METAL_KERNEL_TYPE_GEGLU,
|
||||
GGML_METAL_KERNEL_TYPE_SWIGLU,
|
||||
GGML_METAL_KERNEL_TYPE_GEGLU_ERF,
|
||||
GGML_METAL_KERNEL_TYPE_GEGLU_QUICK,
|
||||
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
|
||||
GGML_METAL_KERNEL_TYPE_MEAN,
|
||||
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
|
||||
@@ -1153,6 +1161,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU, elu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ABS, abs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SGN, sgn, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_STEP, step, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSWISH, hardswish, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSIGMOID, hardsigmoid, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_EXP, exp, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction);
|
||||
@@ -1510,6 +1524,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REGLU, reglu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU, geglu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU, swiglu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_ERF, geglu_erf, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_QUICK, geglu_quick, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
|
||||
@@ -1684,6 +1700,12 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_NEG:
|
||||
case GGML_UNARY_OP_ABS:
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -1693,6 +1715,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
@@ -2250,7 +2274,9 @@ static bool ggml_metal_encode_node(
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
float scale;
|
||||
memcpy(&scale, dst->op_params, sizeof(scale));
|
||||
float bias;
|
||||
memcpy(&scale, ((const int32_t *) dst->op_params) + 0, sizeof(float));
|
||||
memcpy(&bias, ((const int32_t *) dst->op_params) + 1, sizeof(float));
|
||||
|
||||
int64_t n = ggml_nelements(dst);
|
||||
|
||||
@@ -2267,6 +2293,7 @@ static bool ggml_metal_encode_node(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
[encoder setBytes:&bias length:sizeof(bias) atIndex:3];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -2430,6 +2457,78 @@ static bool ggml_metal_encode_node(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_ABS:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ABS].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_SGN:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SGN].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_STEP:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_STEP].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_EXP:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_EXP].pipeline;
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
|
||||
@@ -2456,6 +2555,12 @@ static bool ggml_metal_encode_node(
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SWIGLU].pipeline;
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU_ERF].pipeline;
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU_QUICK].pipeline;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -5018,8 +5123,10 @@ static bool ggml_metal_encode_node(
|
||||
/*.nb22 =*/ nb22,
|
||||
/*.nb23 =*/ nb23,
|
||||
/*.ne32 =*/ ne32,
|
||||
/*.ne33 =*/ ne33,
|
||||
/*.nb31 =*/ nb31,
|
||||
/*.nb32 =*/ nb32,
|
||||
/*.nb33 =*/ nb33,
|
||||
/*.ne1 =*/ ne1,
|
||||
/*.ne2 =*/ ne2,
|
||||
/*.scale =*/ scale,
|
||||
|
||||
@@ -109,6 +109,7 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r
|
||||
}
|
||||
|
||||
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
@@ -167,6 +168,7 @@ void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
|
||||
}
|
||||
|
||||
void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
@@ -461,6 +463,7 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re
|
||||
}
|
||||
|
||||
void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
@@ -1011,16 +1014,18 @@ kernel void kernel_scale(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
constant float & scale,
|
||||
constant float & bias,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * scale;
|
||||
dst[tpig] = src0[tpig] * scale + bias;
|
||||
}
|
||||
|
||||
kernel void kernel_scale_4(
|
||||
device const float4 * src0,
|
||||
device float4 * dst,
|
||||
constant float & scale,
|
||||
constant float & bias,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * scale;
|
||||
dst[tpig] = src0[tpig] * scale + bias;
|
||||
}
|
||||
|
||||
kernel void kernel_clamp(
|
||||
@@ -1194,6 +1199,51 @@ kernel void kernel_neg(
|
||||
dst[tpig] = -src0[tpig];
|
||||
}
|
||||
|
||||
kernel void kernel_abs(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = fabs(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_sgn(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
device const float & x = src0[tpig];
|
||||
dst[tpig] = (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f);
|
||||
}
|
||||
|
||||
kernel void kernel_step(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] > 0.0f ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
kernel void kernel_hardswish(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
device const float & x = src0[tpig];
|
||||
dst[tpig] = x * fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
kernel void kernel_hardsigmoid(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
device const float & x = src0[tpig];
|
||||
dst[tpig] = fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
|
||||
}
|
||||
|
||||
kernel void kernel_exp(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = exp(src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_reglu(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
@@ -1258,6 +1308,50 @@ kernel void kernel_swiglu(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_erf(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f+erf_approx<float>(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_quick(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool norm>
|
||||
kernel void kernel_sum_rows(
|
||||
constant ggml_metal_kargs_sum_rows & args,
|
||||
@@ -3857,7 +3951,7 @@ kernel void kernel_flash_attn_ext(
|
||||
// load the mask in shared memory
|
||||
#pragma unroll(Q)
|
||||
for (short j = 0; j < Q; ++j) {
|
||||
device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq3%args.ne32)*args.nb32);
|
||||
device const half * pm = (device const half *) ((device const char *) mask + (iq1 + j)*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33);
|
||||
|
||||
const float m = pm[ic + tiisg];
|
||||
|
||||
@@ -4343,7 +4437,7 @@ kernel void kernel_flash_attn_ext_vec(
|
||||
const bool has_mask = mask != q;
|
||||
|
||||
// pointer to the mask
|
||||
device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq3%args.ne32)*args.nb32);
|
||||
device const half * pm = (device const half *) (mask + iq1*args.nb31 + (iq2%args.ne32)*args.nb32 + (iq3%args.ne33)*args.nb33);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
@@ -88,6 +88,7 @@ set(GGML_OPENCL_KERNELS
|
||||
rms_norm
|
||||
rope
|
||||
scale
|
||||
set_rows
|
||||
sigmoid
|
||||
silu
|
||||
softmax_4_f32
|
||||
@@ -103,6 +104,7 @@ set(GGML_OPENCL_KERNELS
|
||||
tanh
|
||||
pad
|
||||
repeat
|
||||
mul_mat_f16_f32
|
||||
)
|
||||
|
||||
foreach (K ${GGML_OPENCL_KERNELS})
|
||||
|
||||
@@ -351,6 +351,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_gemv_noshuffle_general;
|
||||
cl_program program_gemv_noshuffle;
|
||||
cl_program program_get_rows;
|
||||
cl_program program_set_rows;
|
||||
cl_program program_glu;
|
||||
cl_program program_im2col_f16;
|
||||
cl_program program_im2col_f32;
|
||||
@@ -367,6 +368,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_mul_mv_f16_f32;
|
||||
cl_program program_mul_mv_f32_f32;
|
||||
cl_program program_mul;
|
||||
cl_program program_mul_mat_f16_f32_tiled;
|
||||
cl_program program_div;
|
||||
cl_program program_sub;
|
||||
cl_program program_norm;
|
||||
@@ -398,12 +400,13 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_scale;
|
||||
cl_kernel kernel_silu, kernel_silu_4;
|
||||
cl_kernel kernel_gelu, kernel_gelu_4;
|
||||
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
|
||||
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
||||
cl_kernel kernel_relu;
|
||||
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
||||
cl_kernel kernel_clamp;
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16;
|
||||
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_geglu_erf, kernel_geglu_quick,
|
||||
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
|
||||
cl_kernel kernel_norm;
|
||||
cl_kernel kernel_rms_norm;
|
||||
cl_kernel kernel_group_norm;
|
||||
@@ -411,6 +414,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
||||
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
||||
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
||||
cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
|
||||
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
||||
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
||||
@@ -419,6 +423,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_mul_mat_f16_f32_1row;
|
||||
cl_kernel kernel_mul_mat_f16_f32;
|
||||
cl_kernel kernel_mul_mat_f16_f32_l4;
|
||||
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
||||
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
||||
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
||||
@@ -528,6 +533,16 @@ struct ggml_backend_opencl_context {
|
||||
fclose(ftrace);
|
||||
}
|
||||
|
||||
size_t get_kernel_workgroup_size(cl_kernel kernel) const {
|
||||
size_t workgroup_size = 0;
|
||||
size_t ret_size = 0;
|
||||
CL_CHECK(
|
||||
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
|
||||
sizeof(size_t), &workgroup_size, &ret_size));
|
||||
GGML_ASSERT(sizeof(size_t) == ret_size);
|
||||
return workgroup_size;
|
||||
}
|
||||
|
||||
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
@@ -736,6 +751,8 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_erf = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_erf_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
@@ -753,12 +770,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
backend_ctx->program_glu =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_erf = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_quick = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_erf_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
@@ -996,6 +1017,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mat_f16_f32_tiled
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "mul_mat_f16_f32.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
|
||||
#endif
|
||||
backend_ctx->program_mul_mat_f16_f32_tiled =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -1424,6 +1461,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
}
|
||||
|
||||
// set_rows
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "set_rows.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("set_rows.cl");
|
||||
#endif
|
||||
backend_ctx->program_set_rows =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -2222,6 +2276,22 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_SET_ROWS:
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
|
||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CONT:
|
||||
@@ -2256,6 +2326,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
@@ -2271,6 +2342,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
case GGML_GLU_OP_REGLU:
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
|
||||
default:
|
||||
return false;
|
||||
@@ -3358,6 +3431,111 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
// ne0 = ne00
|
||||
// ne2 = ne02
|
||||
// ne3 = ne03
|
||||
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
const int nblk0 = ne0/ggml_blck_size(dst->type);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_set_rows_f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
kernel = backend_ctx->kernel_set_rows_f16;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("not implemented");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
|
||||
|
||||
int nth0 = 64;
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 32;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
}
|
||||
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
while (nth0 < nblk0 && nth0 < max_workgroup_size) {
|
||||
nth0 *= 2;
|
||||
}
|
||||
|
||||
int rows_per_workgroup = 1;
|
||||
if (nth0 > nblk0) {
|
||||
rows_per_workgroup = nth0 / nblk0;
|
||||
nth0 = nblk0;
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {
|
||||
(size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
|
||||
(size_t)ne02*rows_per_workgroup,
|
||||
(size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -3858,6 +4036,44 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
UNUSED(src1);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
int n = ggml_nelements(dst);
|
||||
|
||||
if (n % 4 == 0) {
|
||||
kernel = backend_ctx->kernel_gelu_erf_4;
|
||||
n /= 4;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_gelu_erf;
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
size_t global_work_size[] = {(size_t)n, 1, 1};
|
||||
size_t local_work_size[] = {64, 1, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -4730,6 +4946,58 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
const int M = src0->ne[1];
|
||||
const int N = src1->ne[1];
|
||||
const int K = src0->ne[0];
|
||||
|
||||
cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
|
||||
|
||||
// Tiling parameters. These need to be tuned for optimal performance.
|
||||
// They must match the #defines in the kernel mul_mat_f16_f32.cl.
|
||||
//
|
||||
// OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
|
||||
// TPWM / TPWN: Threads per Work-group. This is the work-group size.
|
||||
// OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
|
||||
//
|
||||
// The following relationships must hold:
|
||||
// OPWM = TPWM * OPTM
|
||||
// OPWN = TPWN * OPTN
|
||||
//
|
||||
const int OPWM = 64;
|
||||
const int OPWN = 64;
|
||||
const int TPWM = 16;
|
||||
const int TPWN = 8;
|
||||
|
||||
size_t local_work_size[2] = { TPWM, TPWN };
|
||||
size_t global_work_size[2] = {
|
||||
(size_t) ((M + OPWM - 1) / OPWM) * TPWM,
|
||||
(size_t) ((N + OPWN - 1) / OPWN) * TPWN,
|
||||
};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@@ -4743,6 +5011,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
|
||||
src0->ne[1] > 32 && // M > 32
|
||||
src1->ne[1] > 32 && // N > 32
|
||||
src0->ne[0] > 32 && // K > 32
|
||||
src0->ne[2] == 1 && src0->ne[3] == 1 &&
|
||||
src1->ne[2] == 1 && src1->ne[3] == 1 &&
|
||||
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
|
||||
backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
|
||||
ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
@@ -5533,7 +5813,9 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
float scale;
|
||||
memcpy(&scale, dst->op_params, sizeof(scale));
|
||||
float bias;
|
||||
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
|
||||
memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
@@ -5548,6 +5830,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
|
||||
|
||||
int n = ggml_nelements(dst)/4;
|
||||
|
||||
@@ -5757,19 +6040,31 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
|
||||
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
|
||||
|
||||
const int ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int ne01 = src0 ? src0->ne[1] : 0;
|
||||
const int ne02 = src0 ? src0->ne[2] : 0;
|
||||
const int ne03 = src0 ? src0->ne[3] : 0;
|
||||
const int ne00 = src0->ne[0];
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_long nb01 = src0->nb[1];
|
||||
const cl_long nb02 = src0->nb[2];
|
||||
const cl_long nb03 = src0->nb[3];
|
||||
|
||||
const int ne12 = src1 ? src1->ne[2] : 0;
|
||||
const int ne13 = src1 ? src1->ne[3] : 0;
|
||||
|
||||
const cl_long nb11 = src1 ? src1->nb[1] : 0;
|
||||
const cl_long nb12 = src1 ? src1->nb[2] : 0;
|
||||
const cl_long nb13 = src1 ? src1->nb[3] : 0;
|
||||
|
||||
const cl_long nb1 = dst->nb[1];
|
||||
const cl_long nb2 = dst->nb[2];
|
||||
const cl_long nb3 = dst->nb[3];
|
||||
|
||||
float scale, max_bias;
|
||||
memcpy(&scale, dst->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
|
||||
|
||||
const int nrows_x = ggml_nrows(src0);
|
||||
const int nrows_y = src0->ne[1];
|
||||
|
||||
const int n_head = nrows_x/nrows_y;
|
||||
const int n_head = src0->ne[2];
|
||||
const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
@@ -5814,13 +6109,22 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
|
||||
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
|
||||
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
|
||||
|
||||
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
||||
@@ -6227,6 +6531,20 @@ static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const
|
||||
kernel = backend_ctx->kernel_swiglu_f16;
|
||||
}
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_geglu_erf;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_geglu_erf_f16;
|
||||
}
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
kernel = backend_ctx->kernel_geglu_quick;
|
||||
} else {
|
||||
kernel = backend_ctx->kernel_geglu_quick_f16;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported glu op");
|
||||
}
|
||||
@@ -6296,6 +6614,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_get_rows;
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_set_rows;
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
@@ -6341,6 +6665,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_gelu;
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_gelu_erf;
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#define GELU_COEF_A 0.044715f
|
||||
#define GELU_QUICK_COEF -1.702f
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
|
||||
#define SQRT_2_INV 0.70710678118654752440084436210484f
|
||||
|
||||
kernel void kernel_gelu(
|
||||
global float * src0,
|
||||
@@ -35,6 +36,32 @@ kernel void kernel_gelu_4(
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_erf(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
global float * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float*)((global char*)src0 + offset0);
|
||||
dst = (global float*)((global char*)dst + offsetd);
|
||||
|
||||
float x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_erf_4(
|
||||
global float4 * src0,
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
|
||||
float4 x = src0[get_global_id(0)];
|
||||
dst[get_global_id(0)] = 0.5f*x*(1.0f + erf(x*SQRT_2_INV));
|
||||
}
|
||||
|
||||
kernel void kernel_gelu_quick(
|
||||
global float * src0,
|
||||
ulong offset0,
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#define GELU_COEF_A 0.044715f
|
||||
#define GELU_QUICK_COEF -1.702f
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
|
||||
#define SQRT_2_INV 0.70710678118654752440084436210484f
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu
|
||||
@@ -199,3 +201,137 @@ kernel void kernel_swiglu_f16(
|
||||
dst_row[i0] = silu*x1;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu_erf
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_geglu_erf(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f + erf(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_erf_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const half x0 = src0_row[i0];
|
||||
const half x1 = src1_row[i0];
|
||||
|
||||
const half gelu_erf = 0.5f*x0*(1.0f + erf(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = gelu_erf*x1;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// geglu_quick
|
||||
//------------------------------------------------------------------------------
|
||||
kernel void kernel_geglu_quick(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global float * dst_row = (global float *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f + exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_geglu_quick_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
ulong nb01,
|
||||
ulong nb11,
|
||||
int ne0,
|
||||
ulong nb1,
|
||||
int ne00_off,
|
||||
int ne10_off
|
||||
) {
|
||||
src0 = (global char*)((global char*)src0 + offset0);
|
||||
src1 = (global char*)((global char*)src1 + offset1);
|
||||
dst = (global char*)((global char*)dst + offsetd);
|
||||
|
||||
global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
|
||||
global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
|
||||
global half * dst_row = (global half *) ((global char *) dst + get_group_id(0)*nb1);
|
||||
|
||||
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
|
||||
const half x0 = src0_row[i0];
|
||||
const half x1 = src1_row[i0];
|
||||
|
||||
const half gelu_quick = x0*(1.0f/(1.0f + exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = gelu_quick*x1;
|
||||
}
|
||||
}
|
||||
|
||||
130
ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl
Normal file
130
ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl
Normal file
@@ -0,0 +1,130 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
#if defined(cl_qcom_reqd_sub_group_size)
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#else
|
||||
#define REQD_SUBGROUP_SIZE_128
|
||||
#endif
|
||||
|
||||
#define OPWM 64
|
||||
#define OPWN 64
|
||||
#define CPWK 8
|
||||
#define OPTM 4
|
||||
#define OPTN 8
|
||||
|
||||
#define WG_M (OPWM / OPTM)
|
||||
#define WG_N (OPWN / OPTN)
|
||||
#define VEC_K (CPWK / 4)
|
||||
|
||||
REQD_SUBGROUP_SIZE_128
|
||||
__kernel void mul_mat_f16_f32(
|
||||
const int M, const int N, const int K,
|
||||
__global const void* A_void, ulong A_offset,
|
||||
__global const void* B_void, ulong B_offset,
|
||||
__global void* C_void, ulong C_offset) {
|
||||
|
||||
__global const half* A = (__global const half* )((__global const char*)A_void + A_offset);
|
||||
__global const float* B = (__global const float*)((__global const char*)B_void + B_offset);
|
||||
__global float* C = (__global float*)((__global char*)C_void + C_offset);
|
||||
|
||||
const int lidm = get_local_id(0);
|
||||
const int lidn = get_local_id(1);
|
||||
const int lid = lidn * WG_M + lidm;
|
||||
|
||||
const int offsetM = get_group_id(0) * OPWM;
|
||||
const int offsetN = get_group_id(1) * OPWN;
|
||||
|
||||
__local half4 Alocal[OPWM][VEC_K];
|
||||
__local float4 Blocal[OPWN][VEC_K];
|
||||
|
||||
float sum[OPTM][OPTN];
|
||||
|
||||
for (int wm = 0; wm < OPTM; wm++) {
|
||||
for (int wn = 0; wn < OPTN; wn++) {
|
||||
sum[wm][wn] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
const int numTiles = (K + CPWK - 1) / CPWK;
|
||||
|
||||
const int load_row_a = lid % OPWM;
|
||||
const int load_vec_k_a = lid / OPWM;
|
||||
const int global_row_a = offsetM + load_row_a;
|
||||
|
||||
const int load_row_b = lid % OPWN;
|
||||
const int load_vec_k_b = lid / OPWN;
|
||||
const int global_row_b = offsetN + load_row_b;
|
||||
|
||||
for (int t = 0; t < numTiles; t++) {
|
||||
const int k_start = t * CPWK;
|
||||
const int k_vec_start_a = k_start + load_vec_k_a * 4;
|
||||
const int k_vec_start_b = k_start + load_vec_k_b * 4;
|
||||
|
||||
if (global_row_a < M && k_vec_start_a < K) {
|
||||
if (k_vec_start_a + 3 < K) {
|
||||
Alocal[load_row_a][load_vec_k_a] = vload4(0, A + global_row_a * K + k_vec_start_a);
|
||||
} else {
|
||||
half4 tempA = (half4)(0.0h);
|
||||
if (k_vec_start_a < K) tempA.s0 = A[global_row_a * K + k_vec_start_a];
|
||||
if (k_vec_start_a + 1 < K) tempA.s1 = A[global_row_a * K + k_vec_start_a + 1];
|
||||
if (k_vec_start_a + 2 < K) tempA.s2 = A[global_row_a * K + k_vec_start_a + 2];
|
||||
Alocal[load_row_a][load_vec_k_a] = tempA;
|
||||
}
|
||||
} else {
|
||||
Alocal[load_row_a][load_vec_k_a] = (half4)(0.0h);
|
||||
}
|
||||
|
||||
if (global_row_b < N && k_vec_start_b < K) {
|
||||
if (k_vec_start_b + 3 < K) {
|
||||
Blocal[load_row_b][load_vec_k_b] = vload4(0, B + global_row_b * K + k_vec_start_b);
|
||||
} else {
|
||||
float4 tempB = (float4)(0.0f);
|
||||
if (k_vec_start_b < K) tempB.s0 = B[global_row_b * K + k_vec_start_b];
|
||||
if (k_vec_start_b + 1 < K) tempB.s1 = B[global_row_b * K + k_vec_start_b + 1];
|
||||
if (k_vec_start_b + 2 < K) tempB.s2 = B[global_row_b * K + k_vec_start_b + 2];
|
||||
Blocal[load_row_b][load_vec_k_b] = tempB;
|
||||
}
|
||||
} else {
|
||||
Blocal[load_row_b][load_vec_k_b] = (float4)(0.0f);
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
#pragma unroll
|
||||
for (int k_vec = 0; k_vec < VEC_K; k_vec++) {
|
||||
float4 a_fvecs[OPTM];
|
||||
int current_row_a = lidm;
|
||||
for (int wm = 0; wm < OPTM; wm++) {
|
||||
a_fvecs[wm] = convert_float4(Alocal[current_row_a][k_vec]);
|
||||
current_row_a += WG_M;
|
||||
}
|
||||
|
||||
float4 b_fvecs[OPTN];
|
||||
int current_row_b = lidn;
|
||||
for (int wn = 0; wn < OPTN; wn++) {
|
||||
b_fvecs[wn] = Blocal[current_row_b][k_vec];
|
||||
current_row_b += WG_N;
|
||||
}
|
||||
|
||||
for (int wm = 0; wm < OPTM; wm++) {
|
||||
for (int wn = 0; wn < OPTN; wn++) {
|
||||
sum[wm][wn] += dot(a_fvecs[wm], b_fvecs[wn]);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
for (int wm = 0; wm < OPTM; wm++) {
|
||||
int globalRow = offsetM + lidm + wm * WG_M;
|
||||
if (globalRow < M) {
|
||||
for (int wn = 0; wn < OPTN; wn++) {
|
||||
int globalCol = offsetN + lidn + wn * WG_N;
|
||||
if (globalCol < N) {
|
||||
C[globalCol * M + globalRow] = sum[wm][wn];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -8,9 +8,10 @@ kernel void kernel_scale(
|
||||
ulong offset0,
|
||||
global float4 * dst,
|
||||
ulong offsetd,
|
||||
float scale
|
||||
float scale,
|
||||
float bias
|
||||
) {
|
||||
src0 = (global float4*)((global char*)src0 + offset0);
|
||||
dst = (global float4*)((global char*)dst + offsetd);
|
||||
dst[get_global_id(0)] = src0[get_global_id(0)] * scale;
|
||||
dst[get_global_id(0)] = src0[get_global_id(0)] * scale + bias;
|
||||
}
|
||||
|
||||
95
ggml/src/ggml-opencl/kernels/set_rows.cl
Normal file
95
ggml/src/ggml-opencl/kernels/set_rows.cl
Normal file
@@ -0,0 +1,95 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_set_rows_f32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global float * dst_row = (global float *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
|
||||
dst_row[ind] = (float)src_row[ind];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global half * dst_row = (global half *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
|
||||
dst_row[ind] = src_row[ind];
|
||||
}
|
||||
}
|
||||
@@ -22,32 +22,45 @@
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
kernel void kernel_soft_max_4_f16(
|
||||
global float * src0,
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global half * src1,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global float * dst,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne12,
|
||||
int ne13,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
ulong nb13,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3,
|
||||
float scale,
|
||||
float max_bias,
|
||||
float m0,
|
||||
float m1,
|
||||
int n_head_log2
|
||||
) {
|
||||
src0 = (global float *)((global char *)src0 + offset0);
|
||||
src1 = (global half *)((global char *)src1 + offset1);
|
||||
dst = (global float *)((global char *)dst + offsetd);
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0);
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
global half4 * pmask = (global char *)src1 != (global char *)src0 ? (global half4 *)(src1 + i01*ne00) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
int i13 = i03%ne13;
|
||||
int i12 = i02%ne12;
|
||||
int i11 = i01;
|
||||
|
||||
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
global half4 * pmask = src1 != src0 ? (global half4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
|
||||
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user