mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-05-14 17:07:43 +03:00

Compare commits

16 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 9ed6e19b9d |  |
|  | 4c1c3ac09d |  |
|  | 7f3f843c31 |  |
|  | ec562eb673 |  |
|  | 95d469a915 |  |
|  | 1e4579fbb8 |  |
|  | 527045bfb0 |  |
|  | 2dfeca31cc |  |
|  | 46be24d121 |  |
|  | 7e16646015 |  |
|  | ad96bb8c0c |  |
|  | e75cd5efb5 |  |
|  | 5d44db6008 |  |
|  | 3796c94bad |  |
|  | 634275fbbb |  |
|  | bcfe63fc53 |  |
@@ -5,8 +5,15 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04

FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
ARG LEVEL_ZERO_VERSION=1.28.2
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04

RUN apt-get update && \
    apt-get install -y git libssl-dev
    apt-get install -y git libssl-dev wget ca-certificates && \
    cd /tmp && \
    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
    apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
    rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb

WORKDIR /app

@@ -109,4 +116,3 @@ WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
18 .github/workflows/build-sycl.yml (vendored)
@@ -50,6 +50,8 @@ jobs:
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
@@ -71,6 +73,14 @@ jobs:
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
@@ -107,6 +117,7 @@ jobs:
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
steps:
|
||||
@@ -127,6 +138,13 @@ jobs:
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
|
||||
51 .github/workflows/code-style.yml (vendored, new file)

@@ -0,0 +1,51 @@
name: Code Style Checker

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  model-naming:
    runs-on: ubuntu-slim
    steps:
      - uses: actions/checkout@v6
      - name: Check model naming conventions
        run: |
          python3 - << 'EOF'
          import re, os, sys

          pairs = re.findall(
              r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
              open("src/llama-model.cpp").read())

          errors = []
          for arch, cls in pairs:
              suffix = arch[len("LLM_ARCH_"):]
              csuffix = cls[len("llama_model_"):]
              fname = csuffix.replace("_", "-") + ".cpp"

              if not re.fullmatch(r'[A-Z][A-Z0-9_]*', suffix):
                  errors.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")

              if not re.fullmatch(r'[a-z][a-z0-9_]*', csuffix):
                  errors.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")

              elif suffix.lower() != csuffix:
                  errors.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{suffix.lower()}' but got '{cls}'")

              elif not os.path.isfile(f"src/models/{fname}"):
                  errors.append(f"{arch}: expects model file name to be src/models/{fname}, but not found")

          if errors:
              print('\n'.join(f" - {e}" for e in errors)); sys.exit(1)
          print(f"OK: {len(pairs)} mappings validated.")
          EOF
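For context, the construct in src/llama-model.cpp that this checker's regex matches is the arch-to-class dispatch switch. Below is a minimal, self-contained sketch of the convention the workflow enforces; LLM_ARCH_MY_MODEL, llama_model_my_model, llama_model_base and make_model are hypothetical names used only to illustrate it, not actual llama.cpp identifiers.

```cpp
// Sketch of the naming convention checked by the workflow above (hypothetical names).
// - arch suffix:  UPPER_SNAKE_CASE           -> LLM_ARCH_MY_MODEL
// - class suffix: matching lower_snake_case  -> llama_model_my_model
// - source file:  src/models/my-model.cpp (underscores become dashes)
#include <memory>

enum llm_arch { LLM_ARCH_MY_MODEL };

struct llama_model_base { virtual ~llama_model_base() = default; };

// By the convention, this class would be defined in src/models/my-model.cpp.
struct llama_model_my_model : llama_model_base {};

static llama_model_base * make_model(llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_MY_MODEL:
            return new llama_model_my_model(); // the "case ...: return new ..." pair the regex looks for
    }
    return nullptr;
}

int main() {
    std::unique_ptr<llama_model_base> m(make_model(LLM_ARCH_MY_MODEL));
    return m ? 0 : 1;
}
```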
5 .github/workflows/editorconfig.yml (vendored)

@@ -2,11 +2,6 @@ name: EditorConfig Checker

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
25 .github/workflows/release.yml (vendored)
@@ -600,6 +600,7 @@ jobs:
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
|
||||
LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
|
||||
@@ -621,6 +622,13 @@ jobs:
|
||||
run: |
|
||||
scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: pwsh
|
||||
run: |
|
||||
Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
|
||||
Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
|
||||
"LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
@@ -655,6 +663,13 @@ jobs:
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
|
||||
ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
|
||||
if [ -n "$ZE_LOADER_DLL" ]; then
|
||||
echo "Using Level Zero loader: $ZE_LOADER_DLL"
|
||||
cp "$ZE_LOADER_DLL" ./build/bin
|
||||
else
|
||||
echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
|
||||
fi
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
@@ -695,6 +710,8 @@ jobs:
|
||||
env:
|
||||
ONEAPI_ROOT: /opt/intel/oneapi/
|
||||
ONEAPI_INSTALLER_VERSION: "2025.3.3"
|
||||
LEVEL_ZERO_VERSION: "1.28.2"
|
||||
LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -718,6 +735,14 @@ jobs:
|
||||
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
|
||||
sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
|
||||
|
||||
- name: Install Level Zero SDK
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
|
||||
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
|
||||
sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
|
||||
|
||||
- name: ccache
|
||||
uses: ggml-org/ccache-action@v1.2.21
|
||||
with:
|
||||
|
||||
@@ -357,8 +357,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
            auto download_result = common_download_model(model, opts, true);

            if (download_result.model_path.empty()) {
                LOG_ERR("error: failed to download model from Hugging Face\n");
                exit(1);
                throw std::runtime_error("failed to download model from Hugging Face");
            }

            model.name = model.hf_repo;
@@ -380,8 +379,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
            opts.offline = offline;
            auto download_result = common_download_model(model, opts);
            if (download_result.model_path.empty()) {
                LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
                exit(1);
                throw std::runtime_error("failed to download model from " + model.url);
            }
        }

@@ -2223,7 +2221,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
            "comma separated list of RPC servers (host:port)",
            "comma-separated list of RPC servers (host:port)",
            [](common_params & params, const std::string & value) {
                add_rpc_devices(value);
                GGML_UNUSED(params);
@@ -3555,7 +3553,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
        {"--spec-type"}, common_speculative_all_types_str(),
        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
        string_format("comma-separated list of types of speculative decoding to use (default: %s)\n",
            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
            const auto enabled_types = string_split<std::string>(value, ',');
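The new `--spec-type` help text reflects that the option now takes a comma-separated list: the handler splits the value on ',' and each name is presumably resolved through `common_speculative_type_from_name_map`. A rough stand-alone sketch of that flow follows; the `split` helper and the reduced map are illustrative stand-ins, not the actual llama.cpp implementation.

```cpp
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// Reduced stand-in for common_speculative_type_from_name_map.
enum spec_type { SPEC_NONE, SPEC_DRAFT_SIMPLE, SPEC_NGRAM_SIMPLE };

static const std::map<std::string, spec_type> from_name = {
    {"none",         SPEC_NONE},
    {"draft-simple", SPEC_DRAFT_SIMPLE},
    {"ngram-simple", SPEC_NGRAM_SIMPLE},
};

// Stand-in for string_split<std::string>(value, ',').
static std::vector<std::string> split(const std::string & value, char sep) {
    std::vector<std::string> out;
    std::stringstream ss(value);
    for (std::string item; std::getline(ss, item, sep); ) {
        out.push_back(item);
    }
    return out;
}

int main() {
    std::vector<spec_type> types;
    for (const auto & name : split("ngram-simple,draft-simple", ',')) {
        auto it = from_name.find(name);
        if (it == from_name.end()) {
            std::cerr << "unknown speculative type: " << name << "\n";
            return 1;
        }
        types.push_back(it->second);
    }
    std::cout << "enabled " << types.size() << " speculative types\n";
    return 0;
}
```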
@@ -157,9 +157,9 @@ enum common_params_sampling_config : uint64_t {

enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT, // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -342,6 +342,7 @@ struct common_params_speculative_ngram_cache {

struct common_params_speculative {
    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };

    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
    common_params_speculative_draft draft;

    common_params_speculative_ngram_mod ngram_mod;

@@ -21,8 +21,8 @@

const std::map<std::string, common_speculative_type> common_speculative_type_from_name_map = {
    {"none", COMMON_SPECULATIVE_TYPE_NONE},
    {"draft", COMMON_SPECULATIVE_TYPE_DRAFT},
    {"eagle3", COMMON_SPECULATIVE_TYPE_EAGLE3},
    {"draft-simple", COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE},
    {"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3},
    {"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
    {"ngram-map-k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
    {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
@@ -145,15 +145,15 @@ struct common_speculative_impl {
|
||||
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft : public common_speculative_impl {
|
||||
struct common_speculative_impl_draft_simple : public common_speculative_impl {
|
||||
common_params_speculative_draft params;
|
||||
|
||||
llama_batch batch;
|
||||
|
||||
std::vector<common_sampler_ptr> smpls;
|
||||
|
||||
common_speculative_state_draft(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT, n_seq)
|
||||
common_speculative_impl_draft_simple(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, n_seq)
|
||||
, params(params.draft)
|
||||
{
|
||||
auto * ctx_dft = this->params.ctx_dft;
|
||||
@@ -206,7 +206,7 @@ struct common_speculative_state_draft : public common_speculative_impl {
|
||||
}
|
||||
}
|
||||
|
||||
~common_speculative_state_draft() override {
|
||||
~common_speculative_impl_draft_simple() override {
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
@@ -340,11 +340,11 @@ struct common_speculative_state_draft : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_eagle3 : public common_speculative_impl {
|
||||
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
//common_params_speculative_eagle3 params;
|
||||
|
||||
common_speculative_state_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_EAGLE3, n_seq) {}
|
||||
common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {}
|
||||
|
||||
void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
|
||||
// noop
|
||||
@@ -365,13 +365,13 @@ struct common_speculative_state_eagle3 : public common_speculative_impl {
|
||||
};
|
||||
|
||||
// state of self-speculation (simple implementation, not ngram-map)
|
||||
struct common_speculative_state_ngram_simple : public common_speculative_impl {
|
||||
struct common_speculative_impl_ngram_simple : public common_speculative_impl {
|
||||
common_params_speculative_ngram_map params;
|
||||
|
||||
// shared across all sequences
|
||||
common_ngram_simple_config config;
|
||||
|
||||
common_speculative_state_ngram_simple(
|
||||
common_speculative_impl_ngram_simple(
|
||||
const common_params_speculative & params, uint32_t n_seq,
|
||||
common_ngram_simple_config config)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq)
|
||||
@@ -405,13 +405,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_map_k : public common_speculative_impl {
|
||||
struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
|
||||
common_params_speculative_ngram_map params;
|
||||
|
||||
// n_seq configs
|
||||
std::vector<common_ngram_map> config;
|
||||
|
||||
common_speculative_state_ngram_map_k(
|
||||
common_speculative_impl_ngram_map_k(
|
||||
const common_params_speculative & params,
|
||||
const common_ngram_map & config,
|
||||
uint32_t n_seq)
|
||||
@@ -453,7 +453,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_mod : public common_speculative_impl {
|
||||
struct common_speculative_impl_ngram_mod : public common_speculative_impl {
|
||||
common_params_speculative_ngram_mod params;
|
||||
|
||||
// shared across all sequences
|
||||
@@ -475,7 +475,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_impl {
|
||||
|
||||
std::vector<seq_info> sinfos;
|
||||
|
||||
common_speculative_state_ngram_mod(
|
||||
common_speculative_impl_ngram_mod(
|
||||
const common_params_speculative & params,
|
||||
uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, n_seq)
|
||||
@@ -621,7 +621,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_impl {
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_cache : public common_speculative_impl {
|
||||
struct common_speculative_impl_ngram_cache : public common_speculative_impl {
|
||||
common_params_speculative_ngram_cache params;
|
||||
|
||||
uint16_t n_draft;
|
||||
@@ -639,7 +639,7 @@ struct common_speculative_state_ngram_cache : public common_speculative_impl {
|
||||
|
||||
std::vector<seq_info> sinfos;
|
||||
|
||||
common_speculative_state_ngram_cache(
|
||||
common_speculative_impl_ngram_cache(
|
||||
const common_params_speculative & params,
|
||||
uint32_t n_seq,
|
||||
uint16_t n_draft,
|
||||
@@ -775,7 +775,7 @@ static common_ngram_map get_common_ngram_map(
|
||||
return common_ngram_map(size_key, size_value, key_only, min_hits);
|
||||
}
|
||||
|
||||
static common_speculative_state_ngram_cache create_state_ngram_cache(
|
||||
static common_speculative_impl_ngram_cache create_state_ngram_cache(
|
||||
const common_speculative_config & config,
|
||||
uint32_t n_seq,
|
||||
const std::string & path_static,
|
||||
@@ -786,7 +786,7 @@ static common_speculative_state_ngram_cache create_state_ngram_cache(
|
||||
bool save_static = false;
|
||||
bool save_dynamic = false;
|
||||
|
||||
common_speculative_state_ngram_cache state(config.params, n_seq, n_draft, path_static, path_dynamic, save_static, save_dynamic);
|
||||
common_speculative_impl_ngram_cache state(config.params, n_seq, n_draft, path_static, path_dynamic, save_static, save_dynamic);
|
||||
|
||||
return state;
|
||||
}
|
||||
@@ -818,8 +818,8 @@ const char * common_speculative_all_types_str() {
|
||||
std::string common_speculative_type_to_str(common_speculative_type type) {
|
||||
switch (type) {
|
||||
case COMMON_SPECULATIVE_TYPE_NONE: return "none";
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT: return "draft";
|
||||
case COMMON_SPECULATIVE_TYPE_EAGLE3: return "eagle3";
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: return "draft-simple";
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: return "draft-eagle3";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram-simple";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram-map-k";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v";
|
||||
@@ -872,9 +872,9 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
{
|
||||
uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
|
||||
|
||||
bool has_draft = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT));
|
||||
bool has_draft_model = !params.draft.mparams.path.empty();
|
||||
bool has_draft_model_path = !params.draft.mparams.path.empty();
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
// bool has_mtp = false; // TODO: add MTP here
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
|
||||
@@ -906,22 +906,22 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
if (has_ngram_cache) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
|
||||
}
|
||||
if (has_draft) {
|
||||
if (!has_draft_model) {
|
||||
if (has_draft_simple) {
|
||||
if (!has_draft_model_path) {
|
||||
LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__);
|
||||
has_draft = false;
|
||||
has_draft_simple = false;
|
||||
}
|
||||
} else if (has_draft_model) {
|
||||
} else if (has_draft_model_path) {
|
||||
LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__);
|
||||
has_draft = true;
|
||||
has_draft_simple = true;
|
||||
}
|
||||
|
||||
if (has_draft) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params));
|
||||
if (has_draft_simple) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params));
|
||||
}
|
||||
// TODO: add MTP here
|
||||
if (has_draft_eagle3) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params));
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -932,12 +932,12 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
switch (config.type) {
|
||||
case COMMON_SPECULATIVE_TYPE_NONE:
|
||||
break;
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT: {
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft>(config.params, n_seq));
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: {
|
||||
impls.push_back(std::make_unique<common_speculative_impl_draft_simple>(config.params, n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_EAGLE3: {
|
||||
impls.push_back(std::make_unique<common_speculative_state_eagle3>(config.params, n_seq));
|
||||
case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: {
|
||||
impls.push_back(std::make_unique<common_speculative_impl_draft_eagle3>(config.params, n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
|
||||
@@ -950,7 +950,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
/* .size_ngram = */ ngram_size_key,
|
||||
/* .size_mgram = */ mgram_size_value
|
||||
};
|
||||
auto state = std::make_unique<common_speculative_state_ngram_simple>(
|
||||
auto state = std::make_unique<common_speculative_impl_ngram_simple>(
|
||||
/* .params = */ config.params,
|
||||
/* .n_seq = */ n_seq,
|
||||
/* .state = */ config_simple
|
||||
@@ -961,13 +961,13 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
|
||||
impls.push_back(
|
||||
std::make_unique<common_speculative_state_ngram_map_k>(
|
||||
std::make_unique<common_speculative_impl_ngram_map_k>(
|
||||
config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
|
||||
impls.push_back(
|
||||
std::make_unique<common_speculative_state_ngram_mod>(config.params, n_seq));
|
||||
std::make_unique<common_speculative_impl_ngram_mod>(config.params, n_seq));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
|
||||
@@ -975,7 +975,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
config, n_seq,
|
||||
params.ngram_cache.lookup_cache_static,
|
||||
params.ngram_cache.lookup_cache_dynamic);
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
|
||||
impls.push_back(std::make_unique<common_speculative_impl_ngram_cache>(state));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
||||
@@ -57,17 +57,22 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin

## Validated Models

The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using the GPU device.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- Additional model support, quantization formats, and validation are a work in progress.

- [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
- [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
- [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
- [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
- [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
- [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
- [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
- [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
| Model | Validated | Known Issues |
| :------| :---------- | :-------------|
| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |

## Build Instructions
@@ -720,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

@@ -733,9 +734,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0 | Enable Flash-Attention. It can reduce memory usage; the performance impact depends on the LLM. |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimized features for Intel GPUs. (Recommended to set to 1 for Intel devices older than Gen 10.) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graph feature. Disabled by default because SYCL Graph is still in development and does not yet improve performance. |
| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB. |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |

## Compile-time Flags

@@ -819,7 +821,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo

- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`

  You need to enable to support 4GB memory malloc by:
  With the default `GGML_SYCL_ENABLE_LEVEL_ZERO=1`, llama.cpp requests Level Zero's relaxed maximum-size allocation limit directly. If Level Zero support is disabled at build time or runtime and the allocation goes through SYCL/Unified Runtime instead, enable support for allocations larger than 4 GiB by:
  ```
  export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
  set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# type: ignore

import argparse
import json
@@ -100,6 +99,8 @@ D) {D}


class BaseDataset(ABC):
    questions: List[Dict]

    @abstractmethod
    def get_question(self, index: int) -> Dict:
        pass
@@ -573,7 +574,7 @@ def normalize_number(s: str) -> Optional[int]:
class AimeDataset(BaseDataset):
    def __init__(self, split: str = "train"):
        self.split = split
        self.questions: List[Dict] = []
        self.questions = []
        self._load_dataset()

    def _load_dataset(self):
@@ -618,7 +619,7 @@ class AimeDataset(BaseDataset):

class Aime2025Dataset(BaseDataset):
    def __init__(self):
        self.questions: List[Dict] = []
        self.questions = []
        self._load_dataset()

    def _load_dataset(self):
@@ -681,7 +682,7 @@ class Aime2025Dataset(BaseDataset):
class Gsm8kDataset(BaseDataset):
    def __init__(self, split: str = "test"):
        self.split = split
        self.questions: List[Dict] = []
        self.questions = []
        self._load_dataset()

    def _load_dataset(self):
@@ -742,7 +743,7 @@ class GpqaDataset(BaseDataset):
    def __init__(self, variant: str = "diamond", seed: int = 1234):
        self.variant = variant
        self.seed = seed
        self.questions: List[Dict] = []
        self.questions = []
        self._load_dataset()

    def _load_dataset(self):
@@ -249,6 +249,7 @@ option(GGML_SYCL "ggml: use SYCL"
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
     "ggml: sycl target device")
@@ -184,13 +184,15 @@ static __global__ void ggml_cuda_ar_kernel(
#pragma unroll
        for (int k = 0; k < ELEMS_PER_VEC; ++k) {
            const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
            recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
            recvbuf[off + k] = ggml_cuda_cast<T_dst>(
                ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(wire[k]));
        }
    }
    if (bid == 0 && tid < count - tail) {
        const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
        recvbuf[tail + tid] =
            ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
        recvbuf[tail + tid] = ggml_cuda_cast<T_dst>(
            ggml_cuda_cast<float>(d_low) +
            ggml_cuda_cast<float>(host_other[tail + tid]));
    }
    }
}
@@ -210,7 +212,8 @@ static __global__ void ggml_cuda_ar_add_kernel(
    const int nt = gridDim.x * blockDim.x;
    for (int i = tid; i < count; i += nt) {
        const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
        dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
        dst[i] = ggml_cuda_cast<T_dst>(
            ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(src[i]));
    }
}
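The change in all three kernels is the same: instead of converting each operand to T_dst and adding in the destination type, the operands are widened to float, added, and only the sum is converted back. The host-side sketch below illustrates why this matters when T_dst is a reduced-precision type such as half; the `to_dst` quantizer is a made-up stand-in, not `ggml_cuda_cast` or any real CUDA API.

```cpp
#include <cstdio>

// Made-up low-precision "destination type": values round to multiples of 0.25.
static float to_dst(float x) {
    return 0.25f * static_cast<long>(x / 0.25f + (x >= 0 ? 0.5f : -0.5f));
}

int main() {
    const float a = 0.37f, b = 0.37f;

    // Old pattern: round each operand to the destination type, then add (two extra roundings).
    const float sum_old = to_dst(to_dst(a) + to_dst(b));

    // New pattern: add in float, round the result once.
    const float sum_new = to_dst(a + b);

    std::printf("round-then-add: %.2f, add-then-round: %.2f (exact: %.2f)\n",
                sum_old, sum_new, a + b);
    return 0;
}
```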
@@ -2865,6 +2865,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_UNARY_OP_NEG: return HTP_OP_UNARY_NEG;
|
||||
case GGML_UNARY_OP_EXP: return HTP_OP_UNARY_EXP;
|
||||
case GGML_UNARY_OP_SOFTPLUS: return HTP_OP_UNARY_SOFTPLUS;
|
||||
case GGML_UNARY_OP_TANH: return HTP_OP_UNARY_TANH;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3335,6 +3336,7 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
case GGML_UNARY_OP_TANH:
|
||||
supp = ggml_hexagon_supported_unary(sess, op);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
|
||||
@@ -62,6 +62,7 @@ enum htp_op_code {
|
||||
HTP_OP_UNARY_EXP,
|
||||
HTP_OP_UNARY_NEG,
|
||||
HTP_OP_UNARY_SOFTPLUS,
|
||||
HTP_OP_UNARY_TANH,
|
||||
HTP_OP_GLU_SWIGLU,
|
||||
HTP_OP_GLU_SWIGLU_OAI,
|
||||
HTP_OP_GLU_GEGLU,
|
||||
|
||||
@@ -542,6 +542,7 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_UNARY_SIGMOID:
|
||||
case HTP_OP_UNARY_NEG:
|
||||
case HTP_OP_UNARY_EXP:
|
||||
case HTP_OP_UNARY_TANH:
|
||||
case HTP_OP_L2_NORM:
|
||||
return op_unary(octx);
|
||||
|
||||
|
||||
@@ -373,6 +373,21 @@ static void l2_norm_f32(const float * restrict src,
|
||||
}
|
||||
}
|
||||
|
||||
static void tanh_f32(const float * restrict src,
|
||||
float * restrict dst,
|
||||
uint8_t * restrict spad,
|
||||
const uint32_t num_rows,
|
||||
const uint32_t row_elems,
|
||||
const size_t row_size,
|
||||
int32_t * op_params) {
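// Note: this wrapper applies hvx_tanh_f32_aa() row by row; dst is advanced with the
// same row_size as src, so source and destination rows are assumed to share one stride,
// and spad/op_params are unused for this op (the caller passes NULL for spad).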
|
||||
for (uint32_t ir = 0; ir < num_rows; ir++) {
|
||||
const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size);
|
||||
uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size);
|
||||
|
||||
hvx_tanh_f32_aa(dst_local, src_local, row_elems);
|
||||
}
|
||||
}
|
||||
|
||||
static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * data) {
|
||||
const struct htp_unary_context * uctx = (const struct htp_unary_context *) data;
|
||||
struct htp_ops_context * octx = uctx->octx;
|
||||
@@ -477,6 +492,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void *
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
softplus_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_UNARY_TANH:
|
||||
tanh_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
case HTP_OP_L2_NORM:
|
||||
l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params);
|
||||
break;
|
||||
@@ -547,10 +565,12 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) {
|
||||
case HTP_OP_UNARY_SOFTPLUS:
|
||||
op_type = "softplus-f32";
|
||||
break;
|
||||
case HTP_OP_UNARY_TANH:
|
||||
op_type = "tanh-f32";
|
||||
break;
|
||||
case HTP_OP_L2_NORM:
|
||||
op_type = "l2norm-f32";
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
|
||||
@@ -106,6 +106,10 @@ set(GGML_OPENCL_KERNELS
|
||||
gemv_moe_q4_0_f32_ns
|
||||
gemm_moe_q4_1_f32_ns
|
||||
gemv_moe_q4_1_f32_ns
|
||||
gemm_moe_q5_0_f32_ns
|
||||
gemv_moe_q5_0_f32_ns
|
||||
gemm_moe_q5_1_f32_ns
|
||||
gemv_moe_q5_1_f32_ns
|
||||
gemm_moe_mxfp4_f32
|
||||
gemv_moe_mxfp4_f32
|
||||
gemm_moe_mxfp4_f32_ns
|
||||
|
||||
@@ -556,6 +556,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_convert_block_q4_0_trans4_ns, kernel_restore_block_q4_0_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q4_1, kernel_restore_block_q4_1;
|
||||
cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns;
|
||||
cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
|
||||
cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns;
|
||||
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
|
||||
@@ -615,6 +617,8 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_timestep_embedding;
|
||||
cl_kernel kernel_gemv_moe_q4_0_f32_ns, kernel_gemm_moe_q4_0_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
|
||||
cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns;
|
||||
cl_kernel kernel_moe_reorder_b;
|
||||
@@ -973,6 +977,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans4_ns", &err), err));
|
||||
@@ -2995,6 +3003,74 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_moe_q5_0_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_moe_q5_0_f32_ns.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_moe_q5_0_f32_ns.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemv_moe_q5_0_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_0_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_q5_0_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_moe_q5_0_f32_ns.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_moe_q5_0_f32_ns.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_q5_0_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_0_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_moe_q5_1_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemv_moe_q5_1_f32_ns.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemv_moe_q5_1_f32_ns.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemv_moe_q5_1_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_1_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemm_moe_q5_1_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "gemm_moe_q5_1_f32_ns.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("gemm_moe_q5_1_f32_ns.cl");
|
||||
#endif
|
||||
cl_program prog =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_gemm_moe_q5_1_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_1_f32_ns", &err), err));
|
||||
CL_CHECK(clReleaseProgram(prog));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// gemv_moe_mxfp4_f32_ns
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@@ -3852,6 +3928,122 @@ struct ggml_tensor_extra_cl_q4_1 {
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_q5_0 {
|
||||
// Quantized values.
|
||||
cl_mem qs = nullptr;
|
||||
// Quantized values in image1d_buffer_t.
|
||||
cl_mem qs_img = nullptr;
|
||||
// 5-th bit values.
|
||||
cl_mem qh = nullptr;
|
||||
// 5-th bit values in image1d_buffer_t.
|
||||
cl_mem qh_img = nullptr;
|
||||
// Scales.
|
||||
cl_mem d = nullptr;
|
||||
// Scales in image1d_buffer_t.
|
||||
cl_mem d_img = nullptr;
|
||||
// Size of quantized values.
|
||||
size_t size_qs = 0;
|
||||
// Size of 5-th bit values.
|
||||
size_t size_qh = 0;
|
||||
// Size of scales.
|
||||
size_t size_d = 0;
|
||||
|
||||
~ggml_tensor_extra_cl_q5_0() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
if (qs != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(qs));
|
||||
qs = nullptr;
|
||||
}
|
||||
if (qh != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(qh));
|
||||
qh = nullptr;
|
||||
}
|
||||
if (d != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(d));
|
||||
d = nullptr;
|
||||
}
|
||||
if (qs_img != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(qs_img));
|
||||
qs_img = nullptr;
|
||||
}
|
||||
|
||||
qh_img = nullptr;
|
||||
d_img = nullptr;
|
||||
size_qs = 0;
|
||||
size_qh = 0;
|
||||
size_d = 0;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_q5_1 {
|
||||
// Quantized values.
|
||||
cl_mem qs = nullptr;
|
||||
// Quantized values in image1d_buffer_t.
|
||||
cl_mem qs_img = nullptr;
|
||||
// 5-th bit values.
|
||||
cl_mem qh = nullptr;
|
||||
// 5-th bit values in image1d_buffer_t.
|
||||
cl_mem qh_img = nullptr;
|
||||
// Scales.
|
||||
cl_mem d = nullptr;
|
||||
// Scales in image1d_buffer_t.
|
||||
cl_mem d_img = nullptr;
|
||||
// Min
|
||||
cl_mem m = nullptr;
|
||||
// Min in image1d_buffer_t.
|
||||
cl_mem m_img = nullptr;
|
||||
// Size of quantized values.
|
||||
size_t size_qs = 0;
|
||||
// Size of 5-th bit values.
|
||||
size_t size_qh = 0;
|
||||
// Size of scales.
|
||||
size_t size_d = 0;
|
||||
// Size of min values.
|
||||
size_t size_m = 0;
|
||||
|
||||
~ggml_tensor_extra_cl_q5_1() {
|
||||
reset();
|
||||
}
|
||||
|
||||
void reset() {
|
||||
// q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
|
||||
// They must be properly released so that the original buffer can be
|
||||
// properly released to avoid memory leak.
|
||||
if (qs != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(qs));
|
||||
qs = nullptr;
|
||||
}
|
||||
if (qh != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(qh));
|
||||
qh = nullptr;
|
||||
}
|
||||
if (d != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(d));
|
||||
d = nullptr;
|
||||
}
|
||||
if (m != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(m));
|
||||
m = nullptr;
|
||||
}
|
||||
if (qs_img != nullptr) {
|
||||
CL_CHECK(clReleaseMemObject(qs_img));
|
||||
qs_img = nullptr;
|
||||
}
|
||||
// qh_img, d_img, and m_img are not currently allocated separately.
|
||||
// TODO: initialize them for non SMALL_PATH path, or remove them.
|
||||
qh_img = nullptr;
|
||||
d_img = nullptr;
|
||||
m_img = nullptr;
|
||||
size_qs = 0;
|
||||
size_qh = 0;
|
||||
size_d = 0;
|
||||
size_m = 0;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_tensor_extra_cl_mxfp4 {
|
||||
// Quantized values.
|
||||
cl_mem q = nullptr;
|
||||
@@ -4506,7 +4698,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
}
|
||||
// q4_0, q8_0 and mxfp4 have general MUL_MAT_ID support,
|
||||
// the quantizations here currently do not - they are only supported by Adreno with certain shapes
|
||||
if (op->src[0]->type == GGML_TYPE_Q4_1) {
|
||||
if (op->src[0]->type == GGML_TYPE_Q4_1 ||
|
||||
op->src[0]->type == GGML_TYPE_Q5_0 ||
|
||||
op->src[0]->type == GGML_TYPE_Q5_1) {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (op->src[1]->type == GGML_TYPE_F32) {
|
||||
return use_adreno_moe_kernels(backend_ctx, op->src[0])
|
||||
@@ -4692,6 +4886,18 @@ struct ggml_backend_opencl_buffer_context {
|
||||
for (ggml_tensor_extra_cl_q4_1 * e : temp_tensor_extras_q4_1_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q5_0 * e : temp_tensor_extras_q5_0) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q5_0 * e : temp_tensor_extras_q5_0_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q5_1 * e : temp_tensor_extras_q5_1) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_q5_1 * e : temp_tensor_extras_q5_1_in_use) {
|
||||
delete e;
|
||||
}
|
||||
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) {
|
||||
delete e;
|
||||
}
|
||||
@@ -4775,6 +4981,36 @@ struct ggml_backend_opencl_buffer_context {
|
||||
return extra;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl_q5_0 * ggml_opencl_alloc_temp_tensor_extra_q5_0() {
|
||||
ggml_tensor_extra_cl_q5_0 * extra;
|
||||
if (temp_tensor_extras_q5_0.empty()) {
|
||||
extra = new ggml_tensor_extra_cl_q5_0();
|
||||
} else {
|
||||
extra = temp_tensor_extras_q5_0.back();
|
||||
temp_tensor_extras_q5_0.pop_back();
|
||||
}
|
||||
|
||||
temp_tensor_extras_q5_0_in_use.push_back(extra);
|
||||
|
||||
extra->reset();
|
||||
return extra;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl_q5_1 * ggml_opencl_alloc_temp_tensor_extra_q5_1() {
|
||||
ggml_tensor_extra_cl_q5_1 * extra;
|
||||
if (temp_tensor_extras_q5_1.empty()) {
|
||||
extra = new ggml_tensor_extra_cl_q5_1();
|
||||
} else {
|
||||
extra = temp_tensor_extras_q5_1.back();
|
||||
temp_tensor_extras_q5_1.pop_back();
|
||||
}
|
||||
|
||||
temp_tensor_extras_q5_1_in_use.push_back(extra);
|
||||
|
||||
extra->reset();
|
||||
return extra;
|
||||
}
|
||||
|
||||
ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
|
||||
ggml_tensor_extra_cl_mxfp4 * extra;
|
||||
if (temp_tensor_extras_mxfp4.empty()) {
|
||||
@@ -4881,6 +5117,16 @@ struct ggml_backend_opencl_buffer_context {
|
||||
}
|
||||
temp_tensor_extras_q4_1_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_q5_0 * e : temp_tensor_extras_q5_0_in_use) {
|
||||
temp_tensor_extras_q5_0.push_back(e);
|
||||
}
|
||||
temp_tensor_extras_q5_0_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_q5_1 * e : temp_tensor_extras_q5_1_in_use) {
|
||||
temp_tensor_extras_q5_1.push_back(e);
|
||||
}
|
||||
temp_tensor_extras_q5_1_in_use.clear();
|
||||
|
||||
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
|
||||
temp_tensor_extras_mxfp4.push_back(e);
|
||||
}
|
||||
@@ -4923,6 +5169,10 @@ struct ggml_backend_opencl_buffer_context {
|
||||
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1;
|
||||
std::vector<ggml_tensor_extra_cl_q4_1 *> temp_tensor_extras_q4_1_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q5_0 *> temp_tensor_extras_q5_0;
|
||||
std::vector<ggml_tensor_extra_cl_q5_0 *> temp_tensor_extras_q5_0_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q5_1 *> temp_tensor_extras_q5_1;
|
||||
std::vector<ggml_tensor_extra_cl_q5_1 *> temp_tensor_extras_q5_1_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
|
||||
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
|
||||
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
|
||||
@@ -5283,6 +5533,195 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
// Transpose m as ushort
|
||||
transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M);
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
}
|
||||
if (tensor->type == GGML_TYPE_Q5_0) {
|
||||
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
||||
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
||||
|
||||
// Allocate the new extra and create aliases from the original.
|
||||
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
||||
ggml_tensor_extra_cl_q5_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q5_0();
|
||||
|
||||
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
||||
size_t size_qs = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
|
||||
size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(int32_t);
|
||||
GGML_ASSERT(size_d + size_qs + size_qh == ggml_nbytes(tensor) && "Incorrect tensor size");
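// Per-block layout sketch (assuming the standard GGML q5_0 block of 32 values):
// 2 bytes fp16 scale (d) + 4 bytes of packed high bits (qh) + 16 bytes of low
// nibbles (qs) = 22 bytes per block, which summed over all blocks is what the
// assertion above compares against ggml_nbytes(tensor).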
|
||||
|
||||
cl_int err;
|
||||
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
ggml_nbytes(tensor), NULL, &err);
|
||||
CL_CHECK(err);
|
||||
CL_CHECK(clEnqueueWriteBuffer(
|
||||
queue, data_device, CL_TRUE, 0,
|
||||
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
||||
|
||||
cl_buffer_region region;
|
||||
|
||||
// Create subbuffer for scales.
|
||||
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
||||
region.size = size_d;
|
||||
extra->d = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
auto previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for qh.
|
||||
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
|
||||
region.size = size_qh;
|
||||
extra->qh = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
previous_origin = region.origin;
|
||||
|
||||
// Create subbuffer for qs.
|
||||
region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
|
||||
region.size = size_qs;
|
||||
extra->qs = clCreateSubBuffer(
|
||||
extra_orig->data_device, CL_MEM_READ_WRITE,
|
||||
CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
||||
CL_CHECK(err);
|
||||
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
// Adreno moe q5_0 kernel needs special transpose and unshuffling
|
||||
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
||||
cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0_trans4_ns;
|
||||
|
||||
int ne00 = tensor->ne[0];
|
||||
int ne01 = tensor->ne[1];
|
||||
int ne02 = tensor->ne[2];
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
||||
|
||||
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
|
||||
size_t local_work_size[3] = {64, 2, 1};
|
||||
|
||||
cl_event evt;
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
||||
CL_CHECK(clWaitForEvents(1, &evt));
|
||||
CL_CHECK(clReleaseMemObject(data_device));
|
||||
|
||||
// Create image for Q
|
||||
cl_image_format img_format_qs = {CL_R, CL_UNSIGNED_INT32};
|
||||
cl_image_desc img_desc_qs = {
|
||||
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
||||
static_cast<size_t>(ggml_nelements(tensor) / 8),
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
{ extra->qs }
|
||||
};
|
||||
extra->qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_qs, &img_desc_qs, NULL, &err);
|
||||
tensor->extra = extra;
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
return;
|
||||
}
|
||||
    if (tensor->type == GGML_TYPE_Q5_1) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");

        // Allocate the new extra and create aliases from the original.
        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
        ggml_tensor_extra_cl_q5_1 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q5_1();

        size_t size_d  = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
        size_t size_m  = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
        size_t size_qs = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
        size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(int32_t);
        GGML_ASSERT(size_d + size_m + size_qs + size_qh == ggml_nbytes(tensor) && "Incorrect tensor size");

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);
        CL_CHECK(clEnqueueWriteBuffer(
            queue, data_device, CL_TRUE, 0,
            ggml_nbytes(tensor), data, 0, NULL, NULL));

        cl_buffer_region region;

        // The original tensor memory is divided into scales and quants, i.e.,
        // we first store scales, mins, then quants.
        // Create subbuffer for scales.
        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
        region.size = size_d;
        extra->d = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        auto previous_origin = region.origin;

        // Create subbuffer for mins.
        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
        region.size = size_m;
        extra->m = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        previous_origin = region.origin;

        // Create subbuffer for qh.
        region.origin = align_to(previous_origin + size_m, backend_ctx->alignment);
        region.size = size_qh;
        extra->qh = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        previous_origin = region.origin;

        // Create subbuffer for qs.
        region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
        region.size = size_qs;
        extra->qs = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        // Adreno moe q5_1 kernel needs special transpose and unshuffling
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1_trans4_ns;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m));
            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clReleaseMemObject(data_device));

            // Create image for Q
            cl_image_format img_format_qs = {CL_R, CL_UNSIGNED_INT32};
            cl_image_desc img_desc_qs = {
                CL_MEM_OBJECT_IMAGE1D_BUFFER,
                static_cast<size_t>(ggml_nelements(tensor) / 8),
                0, 0, 0, 0, 0, 0, 0,
                { extra->qs }
            };
            extra->qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_qs, &img_desc_qs, NULL, &err);
            tensor->extra = extra;

            return;
        }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        return;
    }
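The d / m / qh / qs sub-buffers above are carved out of the original device allocation back to back, with each region origin padded up to the backend's sub-buffer alignment. A minimal sketch of that carving logic (helper and parameter names are illustrative, not the backend API):

    #include <stddef.h>

    // Illustrative only: round an offset up to the next aligned boundary.
    static size_t align_up(size_t offset, size_t alignment) {
        return (offset + alignment - 1) / alignment * alignment;
    }

    // Compute aligned region origins in the same order used above:
    // scales (d), mins (m), qh, then qs.
    static void q5_1_region_origins(size_t base, size_t align,
                                    size_t size_d, size_t size_m, size_t size_qh,
                                    size_t origins[4]) {
        origins[0] = align_up(base, align);                  // d
        origins[1] = align_up(origins[0] + size_d, align);   // m
        origins[2] = align_up(origins[1] + size_m, align);   // qh
        origins[3] = align_up(origins[2] + size_qh, align);  // qs
    }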
@@ -6109,6 +6548,89 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
        CL_CHECK(clReleaseMemObject(data_device));
        return;
    }
    if (tensor->type == GGML_TYPE_Q5_0) {
        ggml_tensor_extra_cl_q5_0 * extra = (ggml_tensor_extra_cl_q5_0 *)tensor->extra;

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_int err;
            // TODO: use ggml_cl_buffer to manage this temporary buffer
            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
                ggml_nbytes(tensor), NULL, &err);
            CL_CHECK(err);

            cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0_trans4_ns;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clEnqueueReadBuffer(
                queue, data_device, CL_TRUE, offset,
                size, data, 0, NULL, NULL));
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        // TODO: normal q5_0
        (void) extra;
        return;
    }
    if (tensor->type == GGML_TYPE_Q5_1) {
        ggml_tensor_extra_cl_q5_1 * extra = (ggml_tensor_extra_cl_q5_1 *)tensor->extra;

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_int err;
            // TODO: use ggml_cl_buffer to manage this temporary buffer
            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
                ggml_nbytes(tensor), NULL, &err);
            CL_CHECK(err);

            cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1_trans4_ns;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->qs));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->m));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clEnqueueReadBuffer(
                queue, data_device, CL_TRUE, offset,
                size, data, 0, NULL, NULL));
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
        // TODO: normal q5_1
        (void) extra;
        return;
    }
    if (tensor->type == GGML_TYPE_MXFP4) {
        ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;

@@ -13132,7 +13654,7 @@ static void moe_router_reoerder(ggml_backend_t backend, const ggml_tensor * src,
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne02));

    size_t histogram_global_size[] = {(size_t)(((ne21 + 63) / 64) * 64), static_cast<size_t>(ne20), 1};
    size_t histogram_local_size[]  = {64, static_cast<size_t>(ne20), 1};
    size_t histogram_local_size[]  = {64, 1, 1};
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, histogram_global_size, histogram_local_size, src);

    // Scan
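The histogram launch above pads its first global dimension up to the 64-item workgroup width. A quick sketch of that rounding (illustrative helper, not backend code):

    #include <stddef.h>

    // Round a dimension up to the next multiple of the 64-item workgroup,
    // e.g. ne21 = 100 -> 128, ne21 = 64 -> 64.
    static inline size_t round_up_to_64(size_t x) {
        return (x + 63) / 64 * 64;
    }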
@@ -13209,10 +13731,17 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
    ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra;
    ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra;
    ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra;
    ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
#endif

    // TODO: general MoE for the following types
    (void)extra0_q4_1;
    (void)extra0_q5_0;
    (void)extra0_q5_1;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
@@ -13540,8 +14069,11 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
            } else { // for gemm
                kernel = backend_ctx->kernel_gemm_moe_q4_1_f32_ns;

                if (strstr(src0->name, "as") != NULL) {
                // Reorder router if called from test-backend-ops or when new router is generated.
                // Otherwise reuse the reordered result from previous mul_mat_id call.
                if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
                    moe_router_reoerder(backend, src2, ne20);
                    backend_ctx->toggle_reorder = false;
                }

                cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
@@ -13649,6 +14181,359 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif //GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
}
|
||||
case GGML_TYPE_Q5_0: {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_moe_kernels(backend_ctx, src0)) {
|
||||
cl_int status;
|
||||
|
||||
size_t local_size[3] = {64, 2, 1};
|
||||
size_t global_size[3] = {64, 2, 1};
|
||||
|
||||
if (ne12 == 1) { // for gemv
|
||||
kernel = backend_ctx->kernel_gemv_moe_q5_0_f32_ns;
|
||||
|
||||
cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
|
||||
|
||||
// create a sub_buffer for src2
|
||||
cl_buffer_region region;
|
||||
region.origin = offset2;
|
||||
region.size = ne20 * ne21 * sizeof(int);
|
||||
buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
|
||||
// create a sub_buffer for src1
|
||||
region.origin = offset1;
|
||||
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
||||
src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// create image for src1
|
||||
cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
|
||||
buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Set kernel args
|
||||
int arg_idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
|
||||
|
||||
// launch kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(src1_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(buf_src1_image));
|
||||
CL_CHECK(clReleaseMemObject(buf_src2));
|
||||
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_q5_0_f32_ns;
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
|
||||
moe_router_reoerder(backend, src2, ne20);
|
||||
backend_ctx->toggle_reorder = false;
|
||||
}
|
||||
|
||||
cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
|
||||
cl_mem buf_src2, buf_src2_emap;
|
||||
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = sizeof(int) * max_post_router_tile * n_tile_size;
|
||||
buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
region.origin = 0;
|
||||
region.size = sizeof(short) * max_post_router_tile;
|
||||
buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Reorder activations
|
||||
// create a sub_buffer for src1
|
||||
region.origin = offset1;
|
||||
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
||||
sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Create image for reordered src1
|
||||
// Use pre-allocated placeholder
|
||||
region.origin = 0;
|
||||
region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
|
||||
backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
|
||||
buf_src1_reordered = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
cl_image_format image_format_buf_src1;
|
||||
cl_image_desc image_desc_buf_src1;
|
||||
image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
unsigned short map_ratio = ne20 / ne11;
|
||||
GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size));
|
||||
|
||||
size_t reorder_b_local_size[3] = {256, 1, 1};
|
||||
size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
|
||||
|
||||
// Dispatch reorder kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
|
||||
|
||||
// MoE kernel prepare
|
||||
// Create sub buffer for dst
|
||||
region.origin = offsetd;
|
||||
region.size = ne0 * ne1 * ne2 * sizeof(float);
|
||||
sub_buf_dst = clCreateSubBuffer(
|
||||
extrad->data_device,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
// Create image for dst
|
||||
cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
|
||||
buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Set kernel args
|
||||
int arg_idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qs_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_0->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
||||
|
||||
// set thread grid
|
||||
global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
|
||||
global_size[2] = static_cast<size_t>(max_post_router_tile);
|
||||
local_size[1] = 1;
|
||||
local_size[2] = 1;
|
||||
|
||||
// Dispatch kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
||||
|
||||
clReleaseMemObject(sub_buf_src1_pre);
|
||||
clReleaseMemObject(buf_src1_reordered);
|
||||
clReleaseMemObject(image_src1_reordered);
|
||||
clReleaseMemObject(buf_src2);
|
||||
clReleaseMemObject(buf_src2_emap);
|
||||
clReleaseMemObject(sub_buf_dst);
|
||||
clReleaseMemObject(buf_dst_image);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif //GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
}
|
||||
case GGML_TYPE_Q5_1: {
|
||||
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
if (use_adreno_moe_kernels(backend_ctx, src0)) {
|
||||
cl_int status;
|
||||
|
||||
size_t local_size[3] = {64, 2, 1};
|
||||
size_t global_size[3] = {64, 2, 1};
|
||||
|
||||
if (ne12 == 1) { // for gemv
|
||||
kernel = backend_ctx->kernel_gemv_moe_q5_1_f32_ns;
|
||||
|
||||
cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
|
||||
|
||||
// create a sub_buffer for src2
|
||||
cl_buffer_region region;
|
||||
region.origin = offset2;
|
||||
region.size = ne20 * ne21 * sizeof(int);
|
||||
buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// set thread grid
|
||||
global_size[0] = static_cast<size_t>(ne01);
|
||||
global_size[1] = 4;
|
||||
global_size[2] = static_cast<size_t>(ne20);
|
||||
local_size[1] = 4;
|
||||
|
||||
// create a sub_buffer for src1
|
||||
region.origin = offset1;
|
||||
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
||||
src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// create image for src1
|
||||
cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
|
||||
buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Set kernel args
|
||||
int arg_idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
|
||||
|
||||
// launch kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
||||
|
||||
// deallocate sub buffers and images
|
||||
CL_CHECK(clReleaseMemObject(src1_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(buf_src1_image));
|
||||
CL_CHECK(clReleaseMemObject(buf_src2));
|
||||
} else { // for gemm
|
||||
kernel = backend_ctx->kernel_gemm_moe_q5_1_f32_ns;
|
||||
|
||||
// Reorder router if called from test-backend-ops or when new router is generated.
|
||||
// Otherwise reuse the reordered result from previous mul_mat_id call.
|
||||
if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) {
|
||||
moe_router_reoerder(backend, src2, ne20);
|
||||
backend_ctx->toggle_reorder = false;
|
||||
}
|
||||
|
||||
cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image;
|
||||
cl_mem buf_src2, buf_src2_emap;
|
||||
|
||||
cl_buffer_region region;
|
||||
region.origin = 0;
|
||||
region.size = sizeof(int) * max_post_router_tile * n_tile_size;
|
||||
buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
region.origin = 0;
|
||||
region.size = sizeof(short) * max_post_router_tile;
|
||||
buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Reorder activations
|
||||
// create a sub_buffer for src1
|
||||
region.origin = offset1;
|
||||
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
||||
sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Create image for reordered src1
|
||||
// Use pre-allocated placeholder
|
||||
region.origin = 0;
|
||||
region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float);
|
||||
backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size);
|
||||
buf_src1_reordered = clCreateSubBuffer(
|
||||
backend_ctx->prealloc_act_trans.buffer,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
cl_image_format image_format_buf_src1;
|
||||
cl_image_desc image_desc_buf_src1;
|
||||
image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
||||
image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}};
|
||||
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
unsigned short map_ratio = ne20 / ne11;
|
||||
GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n");
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio));
|
||||
CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size));
|
||||
|
||||
size_t reorder_b_local_size[3] = {256, 1, 1};
|
||||
size_t reorder_b_global_size[3] = {static_cast<size_t>(((ne00 / 4) + 255) / 256 * 256), static_cast<size_t>(max_post_router_tile * n_tile_size), 1};
|
||||
|
||||
// Dispatch reorder kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst);
|
||||
|
||||
// MoE kernel prepare
|
||||
// Create sub buffer for dst
|
||||
region.origin = offsetd;
|
||||
region.size = ne0 * ne1 * ne2 * sizeof(float);
|
||||
sub_buf_dst = clCreateSubBuffer(
|
||||
extrad->data_device,
|
||||
0,
|
||||
CL_BUFFER_CREATE_TYPE_REGION,
|
||||
®ion,
|
||||
&status);
|
||||
CL_CHECK(status);
|
||||
// Create image for dst
|
||||
cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT};
|
||||
cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}};
|
||||
buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status);
|
||||
CL_CHECK(status);
|
||||
|
||||
// Set kernel args
|
||||
int arg_idx = 0;
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qs_img));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->qh));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->d));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_1->m));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer)));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
||||
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
||||
|
||||
// set thread grid
|
||||
global_size[1] = static_cast<size_t>((ne01 + 63) / 64);
|
||||
global_size[2] = static_cast<size_t>(max_post_router_tile);
|
||||
local_size[1] = 1;
|
||||
local_size[2] = 1;
|
||||
|
||||
// Dispatch kernel
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
||||
|
||||
clReleaseMemObject(sub_buf_src1_pre);
|
||||
clReleaseMemObject(buf_src1_reordered);
|
||||
clReleaseMemObject(image_src1_reordered);
|
||||
clReleaseMemObject(buf_src2);
|
||||
clReleaseMemObject(buf_src2_emap);
|
||||
clReleaseMemObject(sub_buf_dst);
|
||||
clReleaseMemObject(buf_dst_image);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif //GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
}
|
||||
case GGML_TYPE_Q8_0: {
|
||||
|
||||
@@ -56,6 +56,25 @@ struct block_q4_1 {
    uchar qs[QK4_1 / 2]; // nibbles / quants
};

//------------------------------------------------------------------------------
// block_q5_0
//------------------------------------------------------------------------------
struct block_q5_0 {
    half d;              // delta
    uchar qh[4];         // 5-th bit of quants
    uchar qs[QK5_0 / 2]; // nibbles / quants
};

//------------------------------------------------------------------------------
// block_q5_1
//------------------------------------------------------------------------------
struct block_q5_1 {
    half d;              // delta
    half m;              // min
    uchar qh[4];         // 5-th bit of quants
    uchar qs[QK5_1 / 2]; // nibbles / quants
};

//------------------------------------------------------------------------------
// block_q4_k
//------------------------------------------------------------------------------
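With QK5_0 == QK5_1 == 32, these layouts come to 2 + 4 + 16 = 22 bytes per block_q5_0 and 2 + 2 + 4 + 16 = 24 bytes per block_q5_1, which is exactly what the size_d/size_m/size_qh/size_qs arithmetic on the host side adds up to. A small host-side check (a sketch, assuming the standard ggml block sizes and an element count that is a multiple of 32):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    enum { QK5_1_BLK = 32 };

    static void check_q5_block_budget(size_t n_elements) {
        size_t n_blocks = n_elements / QK5_1_BLK;
        size_t size_d  = n_blocks * sizeof(uint16_t);  // fp16 delta per block
        size_t size_m  = n_blocks * sizeof(uint16_t);  // fp16 min per block (q5_1 only)
        size_t size_qh = n_blocks * sizeof(uint32_t);  // 32 high bits per block
        size_t size_qs = n_blocks * (QK5_1_BLK / 2);   // 32 nibbles per block
        assert(size_d + size_qh + size_qs          == n_blocks * 22); // block_q5_0
        assert(size_d + size_m + size_qh + size_qs == n_blocks * 24); // block_q5_1
    }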
@@ -460,6 +479,191 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}

kernel void kernel_convert_block_q5_0_trans4_ns(
    __global struct block_q5_0 * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK5_0;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_q5_0 * b = src0 + src_blk_offset;
    dst_d[dst_blk_offset] = b->d;

    dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];

    // extract quantization and unshuffle
    ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
    ushort8 post_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    for (int i = 0; i < QK5_0 / 4; ++i) {
        uchar x0 = pre_block_ptr[2*i + 0];
        uchar x1 = pre_block_ptr[2*i + 1];

        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_block_ptr[i + QK5_0 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint4 q_block = as_uint4(post_block);

    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_qs[offset]            = q_block.x;
    dst_qs[offset + ne01]     = q_block.y;
    dst_qs[offset + ne01 * 2] = q_block.z;
    dst_qs[offset + ne01 * 3] = q_block.w;
}
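Concretely, the unshuffle loop above regroups the interleaved nibbles so that quants 0..15 end up in the first 8 bytes and quants 16..31 in the last 8 bytes of the 16-byte qs block. A worked byte-level example (values chosen purely for illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        // qs[0] = 0xA3 holds quant0 = 0x3 (low nibble) and quant16 = 0xA (high nibble);
        // qs[1] = 0xB4 holds quant1 = 0x4 and quant17 = 0xB.
        uint8_t x0 = 0xA3, x1 = 0xB4;
        uint8_t lo = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4));  // post[0]: quants 0 and 1
        uint8_t hi = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));  // post[8]: quants 16 and 17
        assert(lo == 0x43 && hi == 0xBA);
        return 0;
    }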
kernel void kernel_restore_block_q5_0_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global struct block_q5_0 * dst0,
    uint ne00,
    uint ne01
) {
    int i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK5_0;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_q5_0 * b = dst0 + dst_blk_offset;
    b->d = src_d[src_blk_offset];

    ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];

    // collect transposed quantization parts for a block
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_qs[src_q_offset];
    q_block.y = src_qs[src_q_offset + ne01];
    q_block.z = src_qs[src_q_offset + ne01 * 2];
    q_block.w = src_qs[src_q_offset + ne01 * 3];

    ushort8 post_block = as_ushort8(q_block);
    ushort8 pre_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    for (int i = 0; i < QK5_0 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK5_0 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}
kernel void kernel_convert_block_q5_1_trans4_ns(
    __global struct block_q5_1 * src0,
    __global uint * dst_qs,
    __global uint * dst_qh,
    __global half * dst_d,
    __global half * dst_m,
    uint ne00,
    uint ne01
) {
    uint i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK5_1;
    uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    global struct block_q5_1 * b = src0 + src_blk_offset;
    dst_d[dst_blk_offset] = b->d;
    dst_m[dst_blk_offset] = b->m;

    dst_qh[dst_blk_offset] = ((global uint *)(&(b->qh[0])))[0];

    // extract quantization and unshuffle
    ushort8 pre_block = ((global ushort8 *)(&(b->qs[0])))[0];
    ushort8 post_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    for (int i = 0; i < QK5_1 / 4; ++i) {
        uchar x0 = pre_block_ptr[2*i + 0];
        uchar x1 = pre_block_ptr[2*i + 1];

        post_block_ptr[i + 0        ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        post_block_ptr[i + QK5_1 / 4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }

    uint4 q_block = as_uint4(post_block);

    uint offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    dst_qs[offset]            = q_block.x;
    dst_qs[offset + ne01]     = q_block.y;
    dst_qs[offset + ne01 * 2] = q_block.z;
    dst_qs[offset + ne01 * 3] = q_block.w;
}
kernel void kernel_restore_block_q5_1_trans4_ns(
    __global uint * src_qs,
    __global uint * src_qh,
    __global half * src_d,
    __global half * src_m,
    __global struct block_q5_1 * dst0,
    uint ne00,
    uint ne01
) {
    int i00 = get_global_id(1);
    uint i01 = get_global_id(0);
    uint i02 = get_global_id(2);

    uint ne00_blk = ne00 / QK5_1;
    uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
    uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;

    __global struct block_q5_1 * b = dst0 + dst_blk_offset;
    b->d = src_d[src_blk_offset];
    b->m = src_m[src_blk_offset];

    ((__global uint *)(&(b->qh[0])))[0] = src_qh[src_blk_offset];

    // collect transposed quantization parts for a block
    uint src_q_offset = i02 * ne00_blk * ne01 * 4 + i00 * ne01 * 4 + i01;
    uint4 q_block;
    q_block.x = src_qs[src_q_offset];
    q_block.y = src_qs[src_q_offset + ne01];
    q_block.z = src_qs[src_q_offset + ne01 * 2];
    q_block.w = src_qs[src_q_offset + ne01 * 3];

    ushort8 post_block = as_ushort8(q_block);
    ushort8 pre_block = (ushort8)(0);

    uchar * pre_block_ptr = (uchar *)(&pre_block);
    uchar * post_block_ptr = (uchar *)(&post_block);

    for (int i = 0; i < QK5_1 / 4; ++i) {
        uchar x0 = post_block_ptr[i + 0];
        uchar x1 = post_block_ptr[i + QK5_1 / 4];

        pre_block_ptr[2 * i + 0] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        pre_block_ptr[2 * i + 1] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);
    }
    ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block;
}

//------------------------------------------------------------------------------
// block_mxfp4
//------------------------------------------------------------------------------
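The new GEMM kernels below (and their GEMV counterparts) reconstruct each 5-bit quant from its nibble in qs and its high bit in qh before multiplying. A scalar reference of the same dequantization, consistent with the dequantize macros that follow (a sketch for clarity, not part of the kernels):

    // qs_i is the i-th 4-bit nibble, qh_i the i-th high bit,
    // d / m are the per-block fp16 scale and minimum.
    static inline float dequant_q5_0(unsigned qs_i, unsigned qh_i, float d) {
        return d * (float)((int)(qs_i | (qh_i << 4)) - 16);
    }
    static inline float dequant_q5_1(unsigned qs_i, unsigned qh_i, float d, float m) {
        return d * (float)(qs_i | (qh_i << 4)) + m;
    }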
256	ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl	Normal file
@@ -0,0 +1,256 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#define TILESIZE_K 16
|
||||
#define TILESIZE_M 64
|
||||
#define TILESIZE_N 32
|
||||
|
||||
|
||||
#define dequantize_q5_0(qs5x16, qh5x16, a_f16, scale) \
|
||||
a_f16.s0 = (half)((( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s4 = (half)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s8 = (half)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sc = (half)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) - 16) * scale; \
|
||||
a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) - 16) * scale; \
|
||||
|
||||
|
||||
#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
|
||||
acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
|
||||
acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
|
||||
acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
|
||||
acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
|
||||
acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
|
||||
acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
|
||||
acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
|
||||
acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
|
||||
acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
|
||||
acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
|
||||
acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
|
||||
acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
|
||||
acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
|
||||
acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
|
||||
acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
|
||||
acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
|
||||
acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
|
||||
acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
|
||||
acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
|
||||
acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
|
||||
acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
|
||||
acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
|
||||
acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
|
||||
acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
|
||||
acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
|
||||
acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
|
||||
acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
|
||||
acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
|
||||
acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
|
||||
acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
|
||||
acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
|
||||
acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
|
||||
acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
|
||||
acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
|
||||
acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
|
||||
acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
|
||||
acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
|
||||
acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
|
||||
acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
|
||||
acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
|
||||
acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
|
||||
acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
|
||||
acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
|
||||
acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
|
||||
acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
|
||||
acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
|
||||
acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
|
||||
acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
|
||||
acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
|
||||
acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
|
||||
acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
|
||||
acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
|
||||
acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
|
||||
acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
|
||||
acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
|
||||
acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
|
||||
acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
|
||||
acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
|
||||
acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
|
||||
acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
|
||||
acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
|
||||
acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
|
||||
acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
|
||||
|
||||
__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
|
||||
kernel void kernel_gemm_moe_q5_0_f32_ns(
|
||||
__read_only image1d_buffer_t src0_qs,
|
||||
__global uint * src0_qh,
|
||||
__global half * src0_d,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global ushort * src2_emap,
|
||||
__write_only image1d_buffer_t dst,
|
||||
__global int * total_tiles,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint block_id_m = get_global_id(1); // m_tile
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
return;
|
||||
}
|
||||
|
||||
__private half16 reg_a;
|
||||
__private float32 reg_c = (float32)(0);
|
||||
__local half4 shared_b[128];
|
||||
|
||||
const ushort expert_id = src2_emap[block_id_n];
|
||||
|
||||
const uint row = block_id_m * TILESIZE_M;
|
||||
const uint col = block_id_n * TILESIZE_N;
|
||||
|
||||
uint sub_block_id_m = get_local_id(0);
|
||||
uint2 b_global_offset;
|
||||
b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
|
||||
b_global_offset.y = b_global_offset.x + (16 * ne00);
|
||||
uint2 b_local_offset;
|
||||
b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
|
||||
b_local_offset.y = b_local_offset.x + 16;
|
||||
|
||||
// Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
|
||||
for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
|
||||
// First sub-block
|
||||
uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
|
||||
uint b_sub_offset = col * ne00 + step;
|
||||
|
||||
// Load scale for current Q5_0 block
|
||||
uint blk_offset = s_sub_offset + get_global_id(0);
|
||||
half s = src0_d[blk_offset];
|
||||
|
||||
// Load 32 qh (5-th bit of each Q5) for the entire block
|
||||
uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]);
|
||||
|
||||
// Load 16 qs (half block) in transposed layout
|
||||
uint2 qsx16;
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
float8 bx8_f32;
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
half8 bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_0(as_ushort4(qsx16), qhx32.lo, reg_a, s);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 8 elements reduction for better precision
|
||||
half16 acc;
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
|
||||
// Repeat for second sub-block
|
||||
uint half_step = step + TILESIZE_K;
|
||||
q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
b_sub_offset = col * ne00 + half_step;
|
||||
|
||||
// Load next 16 qs in transposed layout
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B, each fiber out of 64 in a sub-group loads 8 elements
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_0(as_ushort4(qsx16), qhx32.hi, reg_a, s);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot product with 3-levels reduction for better precision
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
// Load poster router and share in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
if (get_local_id(0) < TILESIZE_N) {
|
||||
uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
|
||||
if (idx == 0xFFFFFFFF) {
|
||||
idx = src2[block_id_n * TILESIZE_N + 0];
|
||||
}
|
||||
out_idx[get_local_id(0)] = idx * ne01;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Scatter results back to original position in output grid
|
||||
uint m_offset = row + get_local_id(0);
|
||||
|
||||
write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
|
||||
write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
|
||||
write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
|
||||
write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
|
||||
write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
|
||||
write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
|
||||
write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
|
||||
write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
|
||||
write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
|
||||
write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
|
||||
write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
|
||||
write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
|
||||
write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
|
||||
write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
|
||||
write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
|
||||
write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
|
||||
write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
|
||||
write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
|
||||
write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
|
||||
write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
|
||||
write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
|
||||
write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
|
||||
write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
|
||||
write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
|
||||
write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
|
||||
write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
|
||||
write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
|
||||
write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
|
||||
write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
|
||||
write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
|
||||
write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
|
||||
|
||||
// Store zero padding parts to the index of first output in tile, override correct result in the end
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
|
||||
}
|
||||
258	ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl	Normal file
@@ -0,0 +1,258 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#define TILESIZE_K 16
|
||||
#define TILESIZE_M 64
|
||||
#define TILESIZE_N 32
|
||||
|
||||
|
||||
#define dequantize_q5_1(qs5x16, qh5x16, a_f16, scale, m) \
|
||||
a_f16.s0 = (half)((( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s1 = (half)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s2 = (half)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s3 = (half)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s4 = (half)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s5 = (half)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s6 = (half)((((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s7 = (half)((((qs5x16.s1 & 0xF000) >> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s8 = (half)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) * scale + m); \
|
||||
a_f16.s9 = (half)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sa = (half)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sb = (half)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sc = (half)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sd = (half)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.se = (half)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m); \
|
||||
a_f16.sf = (half)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m); \
|
||||
|
||||
|
||||
#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \
|
||||
acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \
|
||||
acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \
|
||||
acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \
|
||||
acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \
|
||||
acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \
|
||||
acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \
|
||||
acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \
|
||||
acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \
|
||||
acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \
|
||||
acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \
|
||||
acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \
|
||||
acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \
|
||||
acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \
|
||||
acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \
|
||||
acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \
|
||||
acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \
|
||||
acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \
|
||||
acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \
|
||||
acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \
|
||||
acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \
|
||||
acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \
|
||||
acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \
|
||||
acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \
|
||||
acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \
|
||||
acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \
|
||||
acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \
|
||||
acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \
|
||||
acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \
|
||||
acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \
|
||||
acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \
|
||||
acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \
|
||||
acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \
|
||||
acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \
|
||||
acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \
|
||||
acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \
|
||||
acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \
|
||||
acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \
|
||||
acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \
|
||||
acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \
|
||||
acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \
|
||||
acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \
|
||||
acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \
|
||||
acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \
|
||||
acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \
|
||||
acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \
|
||||
acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \
|
||||
acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \
|
||||
acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \
|
||||
acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \
|
||||
acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \
|
||||
acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \
|
||||
acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \
|
||||
acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \
|
||||
acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \
|
||||
acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \
|
||||
acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \
|
||||
acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \
|
||||
acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \
|
||||
acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \
|
||||
acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \
|
||||
acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \
|
||||
acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \
|
||||
acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \
|
||||
c_reg.lo += convert_float8(acc.lo); \
|
||||
c_reg.hi += convert_float8(acc.hi); \
|
||||
|
||||
|
||||
__attribute__((qcom_wave_pair_mode(1))) // 1=force single 2=force pair
|
||||
kernel void kernel_gemm_moe_q5_1_f32_ns(
|
||||
__read_only image1d_buffer_t src0_qs,
|
||||
__global uint * src0_qh,
|
||||
__global half * src0_d,
|
||||
__global half * src0_m,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global ushort * src2_emap,
|
||||
__write_only image1d_buffer_t dst,
|
||||
__global int * total_tiles,
|
||||
uint ne00,
|
||||
uint ne01
|
||||
) {
|
||||
uint block_id_m = get_global_id(1); // m_tile
|
||||
uint block_id_n = get_global_id(2); // n_tile
|
||||
|
||||
// Boundary check
|
||||
if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
|
||||
return;
|
||||
}
|
||||
|
||||
__private half16 reg_a;
|
||||
__private float32 reg_c = (float32)(0);
|
||||
__local half4 shared_b[128];
|
||||
|
||||
const ushort expert_id = src2_emap[block_id_n];
|
||||
|
||||
const uint row = block_id_m * TILESIZE_M;
|
||||
const uint col = block_id_n * TILESIZE_N;
|
||||
|
||||
uint sub_block_id_m = get_local_id(0);
|
||||
uint2 b_global_offset;
|
||||
b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00;
|
||||
b_global_offset.y = b_global_offset.x + (16 * ne00);
|
||||
uint2 b_local_offset;
|
||||
b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2);
|
||||
b_local_offset.y = b_local_offset.x + 16;
|
||||
|
||||
// Loop along K axis, 32 elements (one block) for each iteration, divided into 2 sub-blocks
|
||||
for (uint step = 0; step < ne00; step += TILESIZE_K * 2) {
|
||||
// First sub-block
|
||||
uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
uint s_sub_offset = row + ((ne01 * step) >> 5) + ((expert_id * ne00 * ne01) >> 5);
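// Offset units: qs is indexed in uints (8 quants per uint, hence >>3); d, m and qh are one per 32-element block (hence >>5)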
|
||||
uint b_sub_offset = col * ne00 + step;
|
||||
|
||||
// Load scale and m for current Q5_1 block
|
||||
uint blk_offset = s_sub_offset + get_global_id(0);
|
||||
half s = src0_d[blk_offset];
|
||||
half m = src0_m[blk_offset];
|
||||
|
||||
// Load 32 qh bits (the 5th bit of each Q5 value) for the entire block
|
||||
uchar4 qhx32 = as_uchar4(src0_qh[blk_offset]);
|
||||
|
||||
// Load 16 qs (half block) in transposed layout
|
||||
uint2 qsx16;
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B; each of the 64 fibers in a sub-group loads 8 elements
|
||||
float8 bx8_f32;
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
half8 bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_1(as_ushort4(qsx16), qhx32.lo, reg_a, s, m);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot products with 8-element reduction for better precision
|
||||
half16 acc;
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
|
||||
// Repeat for second sub-block
|
||||
uint half_step = step + TILESIZE_K;
|
||||
q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3);
|
||||
b_sub_offset = col * ne00 + half_step;
|
||||
|
||||
// Load next 16 qs in transposed layout
|
||||
qsx16.x = read_imageui(src0_qs, q_sub_offset + sub_block_id_m).x;
|
||||
qsx16.y = read_imageui(src0_qs, q_sub_offset + sub_block_id_m + ne01).x;
|
||||
|
||||
// Load 16x32 floats from matrix B; each of the 64 fibers in a sub-group loads 8 elements
|
||||
bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4);
|
||||
bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4);
|
||||
// Convert to half and store to LM to share within the subgroup
|
||||
bx8_f16 = convert_half8(bx8_f32);
|
||||
shared_b[b_local_offset.x] = bx8_f16.lo;
|
||||
shared_b[b_local_offset.y] = bx8_f16.hi;
|
||||
|
||||
// Dequantization
|
||||
dequantize_q5_1(as_ushort4(qsx16), qhx32.hi, reg_a, s, m);
|
||||
|
||||
sub_group_barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// 32 16x16 fp16 dot products with 3-level reduction for better precision
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0);
|
||||
dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
|
||||
}
|
||||
|
||||
// Load post-router output indices and share them in LM
|
||||
__local uint out_idx[TILESIZE_N];
|
||||
|
||||
if (get_local_id(0) < TILESIZE_N) {
|
||||
uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)];
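// 0xFFFFFFFF marks a padded slot in this tile; reuse the tile's first index so the scatter below writes somewhere valid (overridden at the end)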
|
||||
if (idx == 0xFFFFFFFF) {
|
||||
idx = src2[block_id_n * TILESIZE_N + 0];
|
||||
}
|
||||
out_idx[get_local_id(0)] = idx * ne01;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Scatter results back to original position in output grid
|
||||
uint m_offset = row + get_local_id(0);
|
||||
|
||||
write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1));
|
||||
write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2));
|
||||
write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3));
|
||||
write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4));
|
||||
write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5));
|
||||
write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6));
|
||||
write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7));
|
||||
write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8));
|
||||
write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9));
|
||||
write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa));
|
||||
write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb));
|
||||
write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc));
|
||||
write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd));
|
||||
write_imagef(dst, out_idx[14] + m_offset, (reg_c.se));
|
||||
write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf));
|
||||
write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg));
|
||||
write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh));
|
||||
write_imagef(dst, out_idx[18] + m_offset, (reg_c.si));
|
||||
write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj));
|
||||
write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk));
|
||||
write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl));
|
||||
write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm));
|
||||
write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn));
|
||||
write_imagef(dst, out_idx[24] + m_offset, (reg_c.so));
|
||||
write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp));
|
||||
write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq));
|
||||
write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr));
|
||||
write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss));
|
||||
write_imagef(dst, out_idx[29] + m_offset, (reg_c.st));
|
||||
write_imagef(dst, out_idx[30] + m_offset, (reg_c.su));
|
||||
write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv));
|
||||
|
||||
// Padded slots above were written to the first output index; after the fence, store the correct result last so it overrides them
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0));
|
||||
}
|
||||
119
ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
Normal file
@@ -0,0 +1,119 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#define QK_Q5_0 32
|
||||
#define N_SIMDGROUP 4
|
||||
#define SIMDGROUP_WIDTH 64
|
||||
|
||||
static inline float8 q5_0_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8) {
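// Q5_0: each weight is a 4-bit nibble from qs plus one high bit from qh, decoded as (q - 16); the per-block scale d is applied by the caller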
|
||||
float8 fp32x8;
|
||||
fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) - 16);
|
||||
fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) - 16);
|
||||
fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) - 16);
|
||||
fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) - 16);
|
||||
fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) - 16);
|
||||
fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) - 16);
|
||||
fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) - 16);
|
||||
fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) - 16);
|
||||
return fp32x8;
|
||||
}
|
||||
|
||||
|
||||
__attribute__((qcom_reqd_sub_group_size("half")))
|
||||
__kernel void kernel_gemv_moe_q5_0_f32_ns(
|
||||
__global uint * src0_qs,
|
||||
__global uint * src0_qh,
|
||||
__global half * src0_d,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global float * dst,
|
||||
ulong offsetd,
|
||||
uint ne00,
|
||||
uint ne01,
|
||||
uint ne11
|
||||
) {
|
||||
uint i01 = get_global_id(0);
|
||||
uint i20 = get_global_id(2);
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
uint expert_offset = expert_id * ne00 * ne01 / 32;
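// each expert owns ne00 * ne01 / 32 quantized blocks; expert_offset is the block-granularity start of this expert's weights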
|
||||
|
||||
__private float sum = 0.0f; // each thread calculates a partial sum of one output
|
||||
|
||||
// loop along ne00 at block granularity, advancing N_SIMDGROUP=4 blocks each iteration (one block per subgroup)
|
||||
for (uint ib00 = sgid; ib00 < (ne00 / QK_Q5_0); ib00 += N_SIMDGROUP) {
|
||||
|
||||
// load one block of q
|
||||
uint4 regQ;
|
||||
uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
|
||||
|
||||
regQ.s0 = src0_qs[block_offset];
|
||||
regQ.s1 = src0_qs[block_offset + ne01];
|
||||
regQ.s2 = src0_qs[block_offset + ne01 * 2];
|
||||
regQ.s3 = src0_qs[block_offset + ne01 * 3];
|
||||
|
||||
uint offset = i11 * ne00 / 4 + ib00 * 8;
|
||||
|
||||
uchar4 regQh = as_uchar4(src0_qh[ib00 * ne01 + i01 + expert_offset]);
|
||||
half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
|
||||
|
||||
float8 fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0);
|
||||
|
||||
float4 shared_y4;
|
||||
shared_y4 = read_imagef(src1, (offset + 0));
|
||||
float4 acc = shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 1));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 2));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 3));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
|
||||
fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 4));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 5));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
|
||||
fp32x8 = q5_0_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 6));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 7));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
sum += (float)(regS) * ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #subgroups=4
|
||||
__local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
|
||||
if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
|
||||
if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
|
||||
if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||
|
||||
// 1 output per thread in subgroup 0
|
||||
if (sgid == 0) {
|
||||
dst = dst + (offsetd >> 2);
|
||||
dst[i01 + i20 * ne01] = sum;
|
||||
}
|
||||
|
||||
}
|
||||
121
ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
Normal file
@@ -0,0 +1,121 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
|
||||
#define QK_Q5_1 32
|
||||
#define N_SIMDGROUP 4
|
||||
#define SIMDGROUP_WIDTH 64
|
||||
|
||||
static inline float8 q5_1_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) {
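// Q5_1: 5-bit value (4-bit nibble from qs plus high bit from qh) decoded as q * s + m with the per-block scale s and minimum m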
|
||||
float8 fp32x8;
|
||||
fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) * s + m);
|
||||
fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m);
|
||||
fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m);
|
||||
fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m);
|
||||
fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) * s + m);
|
||||
fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m);
|
||||
fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m);
|
||||
fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m);
|
||||
return fp32x8;
|
||||
}
|
||||
|
||||
|
||||
__attribute__((qcom_reqd_sub_group_size("half")))
|
||||
__kernel void kernel_gemv_moe_q5_1_f32_ns(
|
||||
__global uint * src0_qs,
|
||||
__global uint * src0_qh,
|
||||
__global half * src0_d,
|
||||
__global half * src0_m,
|
||||
__read_only image1d_buffer_t src1,
|
||||
__global uint * src2,
|
||||
__global float * dst,
|
||||
ulong offsetd,
|
||||
uint ne00,
|
||||
uint ne01,
|
||||
uint ne11
|
||||
) {
|
||||
uint i01 = get_global_id(0);
|
||||
uint i20 = get_global_id(2);
|
||||
uint sgid = get_local_id(1);
|
||||
uint slid = get_sub_group_local_id();
|
||||
|
||||
uint i11 = i20 % ne11;
|
||||
|
||||
uint expert_id = src2[i20];
|
||||
uint expert_offset = expert_id * ne00 * ne01 / 32;
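// each expert owns ne00 * ne01 / 32 quantized blocks; expert_offset is the block-granularity start of this expert's weights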
|
||||
|
||||
__private float sum = 0.0f; // each thread calculates a partial sum of one output
|
||||
|
||||
// loop along ne00 at block granularity, advancing N_SIMDGROUP=4 blocks each iteration (one block per subgroup)
|
||||
for (uint ib00 = sgid; ib00 < (ne00 / QK_Q5_1); ib00 += N_SIMDGROUP) {
|
||||
|
||||
// load one block of q
|
||||
uint4 regQ;
|
||||
uint block_offset = expert_offset * 4 + ib00 * ne01 * 4 + i01;
|
||||
|
||||
regQ.s0 = src0_qs[block_offset];
|
||||
regQ.s1 = src0_qs[block_offset + ne01];
|
||||
regQ.s2 = src0_qs[block_offset + ne01 * 2];
|
||||
regQ.s3 = src0_qs[block_offset + ne01 * 3];
|
||||
|
||||
uint offset = i11 * ne00 / 4 + ib00 * 8;
|
||||
|
||||
uchar4 regQh = as_uchar4(src0_qh[ib00 * ne01 + i01 + expert_offset]);
|
||||
half regM = src0_m[ib00 * ne01 + i01 + expert_offset];
|
||||
half regS = src0_d[ib00 * ne01 + i01 + expert_offset];
|
||||
|
||||
float8 fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, regS, regM);
|
||||
|
||||
float4 shared_y4;
|
||||
shared_y4 = read_imagef(src1, (offset + 0));
|
||||
float4 acc = shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 1));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, regS, regM);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 2));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 3));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
|
||||
fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, regS, regM);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 4));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 5));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
|
||||
fp32x8 = q5_1_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, regS, regM);
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 6));
|
||||
acc += shared_y4 * fp32x8.lo;
|
||||
|
||||
shared_y4 = read_imagef(src1, (offset + 7));
|
||||
acc += shared_y4 * fp32x8.hi;
|
||||
|
||||
sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3));
|
||||
}
|
||||
|
||||
// reduction in local memory, assumes #subgroups=4
|
||||
__local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
|
||||
if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
|
||||
if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
|
||||
if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||
|
||||
// 1 output per thread in subgroup 0
|
||||
if (sgid == 0) {
|
||||
dst = dst + (offsetd >> 2);
|
||||
dst[i01 + i20 * ne01] = sum;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -39,6 +39,18 @@ if (WIN32)
|
||||
set(CMAKE_CXX_COMPILER "icx")
|
||||
set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
|
||||
endif()
|
||||
# Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled)
|
||||
if(GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH})
|
||||
set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH})
|
||||
if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}")
|
||||
target_include_directories(ggml-sycl PRIVATE "${LEVEL_ZERO_V1_SDK_PATH}/include")
|
||||
set(LEVEL_ZERO_V1_SDK_LIB_PATH "${LEVEL_ZERO_V1_SDK_PATH}/lib")
|
||||
else()
|
||||
message(WARNING "LEVEL_ZERO_V1_SDK_PATH set but folder not found: ${LEVEL_ZERO_V1_SDK_PATH}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
macro(detect_and_find_package package_name)
|
||||
@@ -93,6 +105,23 @@ endif()
|
||||
|
||||
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
|
||||
|
||||
message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}")
|
||||
if (GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
# Link against Level Zero loader for direct device memory allocation.
|
||||
# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
|
||||
# in the xe kernel driver during multi-GPU inference.
|
||||
find_path(LEVEL_ZERO_INCLUDE_DIR level_zero/ze_api.h HINTS ${ONEAPI_ROOT}/include ${LEVEL_ZERO_V1_SDK_PATH}/include)
|
||||
find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH)
|
||||
if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR)
|
||||
target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
|
||||
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
|
||||
message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}")
|
||||
else()
|
||||
message(WARNING "Level Zero loader or headers not found, Level Zero support disabled")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Link against oneDNN
|
||||
set(GGML_SYCL_DNNL 0)
|
||||
if(GGML_SYCL_DNN)
|
||||
|
||||
@@ -11,6 +11,10 @@
|
||||
//
|
||||
|
||||
#include "common.hpp"
|
||||
#include <sycl/backend.hpp>
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#include <level_zero/ze_api.h>
|
||||
#endif
|
||||
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
@@ -55,6 +59,20 @@ bool gpu_has_xmx(sycl::device &dev) {
|
||||
return dev.has(sycl::aspect::ext_intel_matrix);
|
||||
}
|
||||
|
||||
static int ggml_sycl_get_env(const char *env_name, int default_val) {
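// Parse a non-negative integer from the environment variable env_name; fall back to default_val when unset or malformed.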
|
||||
char *user_device_string = getenv(env_name);
|
||||
int user_number = default_val;
|
||||
|
||||
unsigned n;
|
||||
if (user_device_string != NULL &&
|
||||
sscanf(user_device_string, " %u", &n) == 1) {
|
||||
user_number = (int)n;
|
||||
} else {
|
||||
user_number = default_val;
|
||||
}
|
||||
return user_number;
|
||||
}
|
||||
|
||||
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
|
||||
const int64_t max_range = std::numeric_limits<int>::max();
|
||||
int64_t sycl_down_blk_size = block_size;
|
||||
@@ -66,6 +84,61 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
|
||||
return sycl_down_blk_size;
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) {
|
||||
return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) &&
|
||||
q.get_device().is_gpu() &&
|
||||
q.get_backend() == sycl::backend::ext_oneapi_level_zero;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
|
||||
// DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference.
|
||||
// The decision is made from the queue and runtime env because large buffers can be
|
||||
// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero.
|
||||
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
if (ggml_sycl_use_level_zero_device_alloc(q)) {
|
||||
void *ptr = nullptr;
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
|
||||
#ifdef ZE_RELAXED_ALLOCATION_LIMITS_EXP_NAME
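// Chain the relaxed-limits descriptor so zeMemAllocDevice may exceed the device's default maxMemAllocSize.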
|
||||
ze_relaxed_allocation_limits_exp_desc_t relaxed_desc = {
|
||||
ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC,
|
||||
nullptr,
|
||||
ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE,
|
||||
};
|
||||
ze_device_mem_alloc_desc_t alloc_desc = {
|
||||
ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC,
|
||||
&relaxed_desc,
|
||||
0,
|
||||
0,
|
||||
};
|
||||
#else
|
||||
ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0};
|
||||
#endif
|
||||
ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr);
|
||||
if (r == ZE_RESULT_SUCCESS && ptr) {
|
||||
return ptr;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
return sycl::malloc_device(size, q);
|
||||
}
|
||||
|
||||
void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
|
||||
if (!ptr) return;
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
if (ggml_sycl_use_level_zero_device_alloc(q)) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
|
||||
zeMemFree(ze_ctx, ptr);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, q)));
|
||||
}
|
||||
|
||||
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
|
||||
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
|
||||
@@ -75,8 +148,7 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> str
|
||||
}
|
||||
if (extra->data_device[i] != nullptr && streams.size()>0) {
|
||||
ggml_sycl_set_device(i);
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(extra->data_device[i], *(streams[i]))));
|
||||
}
|
||||
}
|
||||
delete extra;
|
||||
|
||||
@@ -310,6 +310,10 @@ struct ggml_tensor_extra_gpu {
|
||||
optimize_feature optimized_feature;
|
||||
};
|
||||
|
||||
extern int g_ggml_sycl_enable_level_zero;
|
||||
void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
|
||||
void ggml_sycl_free_device(void *ptr, sycl::queue &q);
|
||||
|
||||
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
|
||||
|
||||
namespace sycl_ex = sycl::ext::oneapi::experimental;
|
||||
|
||||
@@ -30,6 +30,10 @@
|
||||
#include <regex>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/backend.hpp>
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
#include <level_zero/ze_api.h>
|
||||
#endif
|
||||
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
|
||||
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
|
||||
#endif
|
||||
@@ -68,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0;
|
||||
int g_ggml_sycl_disable_dnn = 0;
|
||||
int g_ggml_sycl_prioritize_dmmv = 0;
|
||||
int g_ggml_sycl_use_async_mem_op = 0;
|
||||
int g_ggml_sycl_enable_level_zero = 0;
|
||||
int g_ggml_sycl_enable_flash_attention = 1;
|
||||
|
||||
|
||||
@@ -223,6 +228,27 @@ static void ggml_check_sycl() try {
|
||||
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
|
||||
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
|
||||
#else
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
#endif
|
||||
if (g_ggml_sycl_enable_level_zero) {
|
||||
// Verify all GPU devices use the Level Zero backend before enabling L0 APIs.
|
||||
// Only check GPU devices; CPU devices use OpenCL and would otherwise
|
||||
// disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
|
||||
for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) {
|
||||
auto & q = dpct::dev_mgr::instance().get_device(i).default_queue();
|
||||
if (!q.get_device().is_gpu()) {
|
||||
continue;
|
||||
}
|
||||
if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
||||
GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
|
||||
g_ggml_sycl_enable_level_zero = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef SYCL_FLASH_ATTN
|
||||
g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);
|
||||
@@ -253,6 +279,11 @@ static void ggml_check_sycl() try {
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n");
|
||||
#endif
|
||||
#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO)
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n");
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n");
|
||||
#endif
|
||||
|
||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
||||
@@ -262,6 +293,11 @@ static void ggml_check_sycl() try {
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
|
||||
#endif
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero);
|
||||
#else
|
||||
GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n");
|
||||
#endif
|
||||
#if GGML_SYCL_DNNL
|
||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
|
||||
#else
|
||||
@@ -371,7 +407,7 @@ struct ggml_backend_sycl_buffer_context {
|
||||
~ggml_backend_sycl_buffer_context() {
|
||||
if (dev_ptr != nullptr) {
|
||||
ggml_sycl_set_device(device);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
|
||||
}
|
||||
|
||||
//release extra used by tensors
|
||||
@@ -504,8 +540,43 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
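// True only for Level Zero GPU devices that do not report the INTEGRATED property flag.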
|
||||
static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) {
|
||||
if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
|
||||
ze_device_properties_t props = {};
|
||||
props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
|
||||
ze_result_t r = zeDeviceGetProperties(ze_dev, &props);
|
||||
return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
|
||||
const void *ptr_src, size_t size) {
|
||||
#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
|
||||
// Use Level Zero direct copy for dGPU-to-dGPU transfers.
|
||||
const bool l0_copy_supported =
|
||||
ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src);
|
||||
if (g_ggml_sycl_enable_level_zero && l0_copy_supported) {
|
||||
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
|
||||
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
|
||||
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
|
||||
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
|
||||
ze_command_list_handle_t cl;
|
||||
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
|
||||
if (r == ZE_RESULT_SUCCESS) {
|
||||
r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
|
||||
zeCommandListDestroy(cl);
|
||||
if (r == ZE_RESULT_SUCCESS) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// Host-staged copy
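// Fallback path: stage through a temporary host buffer using blocking queue copies when a direct Level Zero copy is unavailable.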
|
||||
char *host_buf = (char *)malloc(size);
|
||||
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
|
||||
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
|
||||
@@ -675,8 +746,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
|
||||
size, *stream)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
|
||||
if (!dev_ptr) {
|
||||
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
|
||||
return nullptr;
|
||||
@@ -917,18 +987,10 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
|
||||
// FIXME: do not crash if SYCL Buffer alloc fails
|
||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_sycl_set_device(i);
|
||||
const queue_ptr stream = ctx->streams[i];
|
||||
char * buf;
|
||||
/*
|
||||
DPCT1009:208: SYCL uses exceptions to report errors and does not use the
|
||||
error codes. The original code was commented out and a warning string
|
||||
was inserted. You need to rewrite this code.
|
||||
*/
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
|
||||
size, *stream)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)ggml_sycl_malloc_device(size, *stream)));
|
||||
if (!buf) {
|
||||
char err_buf[1024];
|
||||
snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
|
||||
@@ -1306,7 +1368,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
|
||||
ggml_sycl_buffer & b = buffer_pool[i];
|
||||
if (b.ptr != nullptr) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(b.ptr, *qptr)));
|
||||
pool_size -= b.size;
|
||||
}
|
||||
}
|
||||
@@ -1374,9 +1436,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
void * ptr;
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
|
||||
look_ahead_size, *qptr)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *)ggml_sycl_malloc_device(look_ahead_size, *qptr)));
|
||||
if (!ptr) {
|
||||
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
|
||||
return nullptr;
|
||||
@@ -1404,7 +1464,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
|
||||
}
|
||||
}
|
||||
GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(ptr, *qptr)));
|
||||
pool_size -= size;
|
||||
}
|
||||
};
|
||||
@@ -3405,7 +3465,7 @@ static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size)
|
||||
// If async allocation extension is not available, use_async should always be false.
|
||||
GGML_ASSERT(!use_async);
|
||||
#endif
|
||||
return sycl::malloc(size, *stream, sycl::usm::alloc::device);
|
||||
return ggml_sycl_malloc_device(size, *stream);
|
||||
}
|
||||
|
||||
static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
|
||||
@@ -3419,7 +3479,7 @@ static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
|
||||
// If async allocation extension is not available, use_async should always be false.
|
||||
GGML_ASSERT(!use_async);
|
||||
#endif
|
||||
sycl::free(ptr, *stream);
|
||||
ggml_sycl_free_device(ptr, *stream);
|
||||
}
|
||||
|
||||
// RAII wrapper for temporary reorder buffers with optional host memory fallback.
|
||||
|
||||
@@ -777,7 +777,10 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
|
||||
const bool tile_can_dispatch_all_q_rows =
|
||||
context.max_subgroup_size > 0 &&
|
||||
context.max_wg_size >= GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE * context.max_subgroup_size;
|
||||
const bool use_tile = context.supports_subgroups && !context.supports_subgroup_matrix && K->type == GGML_TYPE_F16 &&
|
||||
const bool use_subgroup_matrix =
|
||||
context.supports_subgroup_matrix && context.sg_mat_k > 0 && context.sg_mat_n > 0 &&
|
||||
context.src0->ne[0] % context.sg_mat_k == 0 && context.src2->ne[0] % context.sg_mat_n == 0;
|
||||
const bool use_tile = context.supports_subgroups && !use_subgroup_matrix && K->type == GGML_TYPE_F16 &&
|
||||
V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
|
||||
(context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
|
||||
(context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
|
||||
@@ -785,7 +788,7 @@ inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
|
||||
|
||||
decisions.path = use_vec ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
|
||||
use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
|
||||
context.supports_subgroup_matrix ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX :
|
||||
use_subgroup_matrix ? GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX :
|
||||
GGML_WEBGPU_FLASH_ATTN_PATH_NONE;
|
||||
|
||||
if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_NONE) {
|
||||
|
||||
@@ -3148,6 +3148,16 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
||||
}
|
||||
ctx->param_arena.reset();
|
||||
commands.clear();
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
// flush before the next batch can overflow the QuerySet
|
||||
if (ctx->profile_timestamp_query_count + 2 * ctx->global_ctx->command_submit_batch_size >=
|
||||
WEBGPU_MAX_PROFILE_QUERY_COUNT) {
|
||||
ggml_backend_webgpu_collect_profile_results(ctx, profile_pipeline_names, num_inflight_batches);
|
||||
// reset profile timestamp state
|
||||
ctx->profile_timestamp_query_count = 0;
|
||||
profile_pipeline_names.clear();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
node_idx += num_encoded_ops;
|
||||
|
||||
@@ -55,6 +55,7 @@
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)<br/>(env: LLAMA_ARG_RPC) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
@@ -198,7 +199,7 @@
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)<br/><br/>(env: LLAMA_ARG_SPEC_TYPE) |
|
||||
| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)<br/><br/>(env: LLAMA_ARG_SPEC_TYPE) |
|
||||
| `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) |
|
||||
| `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) |
|
||||
| `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |
|
||||
|
||||
@@ -138,6 +138,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)<br/>(env: LLAMA_ARG_RPC) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
|
||||
@@ -72,6 +72,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)<br/>(env: LLAMA_ARG_RPC) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
@@ -247,7 +248,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
|
||||
| `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
|
||||
| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)<br/><br/>(env: LLAMA_ARG_SPEC_TYPE) |
|
||||
| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)<br/><br/>(env: LLAMA_ARG_SPEC_TYPE) |
|
||||
| `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) |
|
||||
| `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) |
|
||||
| `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |
|
||||
|
||||
File diff suppressed because it is too large
@@ -1040,6 +1040,10 @@ json oaicompat_chat_params_parse(
|
||||
inputs.use_jinja = opt.use_jinja;
|
||||
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
|
||||
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
|
||||
const bool continue_final_message = json_value(body, "continue_final_message", false);
|
||||
if (continue_final_message && inputs.add_generation_prompt) {
|
||||
throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true.");
|
||||
}
|
||||
inputs.reasoning_format = opt.reasoning_format;
|
||||
if (body.contains("reasoning_format")) {
|
||||
inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
|
||||
@@ -1071,7 +1075,10 @@ json oaicompat_chat_params_parse(
|
||||
|
||||
// if the assistant message appears at the end of list, we do not add end-of-turn token
|
||||
// for ex. this can be useful to modify the reasoning process in reasoning models
|
||||
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
|
||||
// continue_final_message is the explicit opt-in alias from the vLLM/transformers API,
|
||||
// equivalent to the prefill_assistant heuristic
|
||||
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"
|
||||
&& (continue_final_message || opt.prefill_assistant);
|
||||
common_chat_msg last_message;
|
||||
if (prefill_assistant_message) {
|
||||
last_message = inputs.messages.back();
|
||||
@@ -1082,11 +1089,12 @@ json oaicompat_chat_params_parse(
|
||||
throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
|
||||
}
|
||||
|
||||
/* TODO: test this properly */
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
||||
|
||||
if ( inputs.enable_thinking ) {
|
||||
throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
|
||||
// reject reasoning prefill on channel based templates that do not expose explicit thinking tags
|
||||
if (!last_message.reasoning_content.empty() && inputs.enable_thinking) {
|
||||
auto probe_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
|
||||
if (probe_params.supports_thinking && probe_params.thinking_end_tag.empty()) {
|
||||
throw std::invalid_argument("Assistant prefill with reasoning_content is not supported yet for this template.");
|
||||
}
|
||||
}
|
||||
|
||||
inputs.add_generation_prompt = true;
|
||||
@@ -1098,6 +1106,42 @@ json oaicompat_chat_params_parse(
|
||||
|
||||
/* Append assistant prefilled message */
|
||||
if (prefill_assistant_message) {
|
||||
const bool thinking_active = chat_params.supports_thinking && !chat_params.thinking_end_tag.empty();
|
||||
const bool has_reasoning = !last_message.reasoning_content.empty();
|
||||
const bool has_content = !last_message.content.empty() || !last_message.content_parts.empty();
|
||||
const bool mid_reasoning = has_reasoning && !has_content;
|
||||
|
||||
// some templates inject thinking_start in generation_prompt, others let the model emit it
|
||||
const bool gp_has_think = thinking_active
|
||||
&& chat_params.generation_prompt.find(chat_params.thinking_start_tag) != std::string::npos;
|
||||
|
||||
// open the thinking block when reasoning is present and the template did not inject it
|
||||
if (has_reasoning) {
|
||||
if (thinking_active && !gp_has_think) {
|
||||
chat_params.prompt += chat_params.thinking_start_tag;
|
||||
}
|
||||
chat_params.prompt += last_message.reasoning_content;
|
||||
}
|
||||
|
||||
if (thinking_active) {
|
||||
if (mid_reasoning) {
|
||||
// model continues inside the thinking block, keep generation_prompt open on think
|
||||
if (!gp_has_think) {
|
||||
chat_params.generation_prompt += chat_params.thinking_start_tag;
|
||||
}
|
||||
} else {
|
||||
// close thinking block when reasoning is followed by content, or when the template forced it open
|
||||
if (has_reasoning || gp_has_think) {
|
||||
chat_params.prompt += chat_params.thinking_end_tag;
|
||||
}
|
||||
// strip thinking_start from generation_prompt so the parser routes model output as content
|
||||
auto pos = chat_params.generation_prompt.rfind(chat_params.thinking_start_tag);
|
||||
if (pos != std::string::npos) {
|
||||
chat_params.generation_prompt = chat_params.generation_prompt.substr(0, pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!last_message.content_parts.empty()) {
|
||||
for (auto & p : last_message.content_parts) {
|
||||
chat_params.prompt += p.text;
|
||||
|
||||
@@ -178,6 +178,45 @@ def test_chat_template_assistant_prefill(prefill, re_prefill):
|
||||
assert res.body["__verbose"]["prompt"] == f"<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}"
|
||||
|
||||
|
||||
def test_chat_template_continue_final_message_vllm_compat():
|
||||
"""continue_final_message is the vLLM/transformers explicit alias for the prefill_assistant heuristic.
|
||||
Both must produce the same prompt."""
|
||||
global server
|
||||
server.chat_template = "llama3"
|
||||
server.debug = True
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 8,
|
||||
"add_generation_prompt": False,
|
||||
"continue_final_message": True,
|
||||
"messages": [
|
||||
{"role": "system", "content": "Book"},
|
||||
{"role": "user", "content": "What is the best book"},
|
||||
{"role": "assistant", "content": "Whill"},
|
||||
]
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__verbose" in res.body
|
||||
assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill"
|
||||
|
||||
|
||||
def test_chat_template_continue_final_message_mutual_exclusion():
|
||||
"""add_generation_prompt and continue_final_message both set to true must be rejected"""
|
||||
global server
|
||||
server.chat_template = "llama3"
|
||||
server.start()
|
||||
res = server.make_request("POST", "/chat/completions", data={
|
||||
"max_tokens": 8,
|
||||
"add_generation_prompt": True,
|
||||
"continue_final_message": True,
|
||||
"messages": [
|
||||
{"role": "user", "content": "Hi"},
|
||||
{"role": "assistant", "content": "Hello"},
|
||||
]
|
||||
})
|
||||
assert res.status_code == 400
|
||||
|
||||
|
||||
def test_apply_chat_template():
|
||||
global server
|
||||
server.chat_template = "command-r"
|
||||
|
||||
@@ -179,7 +179,7 @@
|
||||
isEditing = false;
|
||||
|
||||
// If canceling a new system message with placeholder content, remove it without deleting children
|
||||
if (message.role === MessageRole.SYSTEM) {
|
||||
if (message.role === MessageRole.SYSTEM && message.content === SYSTEM_MESSAGE_PLACEHOLDER) {
|
||||
const conversationDeleted = await chatStore.removeSystemPromptPlaceholder(message.id);
|
||||
|
||||
if (conversationDeleted) {
|
||||
|
||||
@@ -74,7 +74,6 @@
|
||||
const editCtx = getMessageEditContext();
|
||||
|
||||
const isAgentic = $derived(hasAgenticContent(message, toolMessages));
|
||||
const hasReasoning = $derived(!!message.reasoningContent);
|
||||
const processingState = useProcessingState();
|
||||
|
||||
let currentConfig = $derived(config());
|
||||
@@ -329,7 +328,7 @@
|
||||
{onCopy}
|
||||
{onEdit}
|
||||
{onRegenerate}
|
||||
onContinue={currentConfig.enableContinueGeneration && !hasReasoning ? onContinue : undefined}
|
||||
onContinue={currentConfig.enableContinueGeneration ? onContinue : undefined}
|
||||
{onForkConversation}
|
||||
{onDelete}
|
||||
{onConfirmDelete}
|
||||
|
||||
@@ -31,9 +31,12 @@
|
||||
|
||||
let parsed = $derived(ModelsService.parseModelId(modelId));
|
||||
let resolvedShowRaw = $derived(showRaw ?? (config().showRawModelNames as boolean) ?? false);
|
||||
let displayName = $derived(parsed.modelName ?? modelId);
|
||||
let allAliases = $derived(aliases ?? []);
|
||||
let allTags = $derived([...(parsed.tags ?? []), ...(tags ?? [])]);
|
||||
|
||||
let uniqueAliases = $derived([...new Set(aliases ?? [])]);
|
||||
let uniqueTags = $derived([...new Set([...(parsed.tags ?? []), ...(tags ?? [])])]);
|
||||
|
||||
let primaryAlias = $derived(uniqueAliases.length === 1 ? uniqueAliases[0] : null);
|
||||
let displayName = $derived(primaryAlias ?? parsed.modelName ?? modelId);
|
||||
</script>
|
||||
|
||||
{#if resolvedShowRaw}
|
||||
@@ -56,14 +59,18 @@
|
||||
</span>
|
||||
{/if}
|
||||
|
||||
{#if allAliases.length > 0}
|
||||
{#each allAliases as alias (alias)}
|
||||
{#if primaryAlias}
|
||||
{#if primaryAlias !== parsed.modelName}
|
||||
<span class={badgeClass}>{parsed.modelName ?? modelId}</span>
|
||||
{/if}
|
||||
{:else if uniqueAliases.length > 1}
|
||||
{#each uniqueAliases as alias (alias)}
|
||||
<span class={badgeClass}>{alias}</span>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
{#if allTags.length > 0}
|
||||
{#each allTags as tag (tag)}
|
||||
{#if uniqueTags.length > 0}
|
||||
{#each uniqueTags as tag (tag)}
|
||||
<span class={tagBadgeClass}>{tag}</span>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
@@ -122,7 +122,7 @@ const SETTINGS_REGISTRY: Record<string, SettingsSectionEntry> = {
|
||||
{
|
||||
key: SETTINGS_KEYS.ENABLE_CONTINUE_GENERATION,
|
||||
label: 'Enable "Continue" button',
|
||||
help: 'Enable "Continue" button for assistant messages. Currently works only with non-reasoning models.',
|
||||
help: 'Enable "Continue" button for assistant messages, including reasoning models.',
|
||||
defaultValue: false,
|
||||
type: SettingsFieldType.CHECKBOX,
|
||||
section: SETTINGS_SECTION_SLUGS.GENERAL,
|
||||
|
||||
@@ -130,7 +130,8 @@ export class ChatService {
|
||||
timings_per_token,
|
||||
// Config options
|
||||
disableReasoningParsing,
|
||||
excludeReasoningFromContext
|
||||
excludeReasoningFromContext,
|
||||
continueFinalMessage
|
||||
} = options;
|
||||
|
||||
const normalizedMessages: ApiChatMessageData[] = messages
|
||||
@@ -209,6 +210,11 @@ export class ChatService {
|
||||
? ReasoningFormat.NONE
|
||||
: ReasoningFormat.AUTO;
|
||||
|
||||
if (continueFinalMessage) {
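// vLLM-compatible continuation: ask the server to extend the final assistant message instead of opening a new assistant turn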
|
||||
requestBody.continue_final_message = true;
|
||||
requestBody.add_generation_prompt = false;
|
||||
}
|
||||
|
||||
if (temperature !== undefined) requestBody.temperature = temperature;
|
||||
if (max_tokens !== undefined) {
|
||||
// Set max_tokens to -1 (infinite) when explicitly configured as 0 or null
|
||||
|
||||
@@ -674,7 +674,8 @@ class ChatStore {
|
||||
},
|
||||
onReasoningChunk: (chunk: string) => {
|
||||
streamedReasoningContent += chunk;
|
||||
// Update UI to show reasoning is being received
|
||||
// mark streaming state so a stop mid-thinking can persist the partial reasoning
|
||||
this.setChatStreaming(convId, streamedContent, currentMessageId);
|
||||
const idx = conversationsStore.findMessageIndex(currentMessageId);
|
||||
conversationsStore.updateMessageAtIndex(idx, {
|
||||
reasoningContent: streamedReasoningContent
|
||||
@@ -989,38 +990,51 @@ class ChatStore {
|
||||
const conversationId = convId || conversationsStore.activeConversation?.id;
|
||||
if (!conversationId) return;
|
||||
const streamingState = this.getChatStreaming(conversationId);
|
||||
if (!streamingState || !streamingState.response.trim()) return;
|
||||
if (!streamingState) return;
|
||||
const messages =
|
||||
conversationId === conversationsStore.activeConversation?.id
|
||||
? conversationsStore.activeMessages
|
||||
: await conversationsStore.getConversationMessages(conversationId);
|
||||
if (!messages.length) return;
|
||||
const lastMessage = messages[messages.length - 1];
|
||||
if (lastMessage?.role === MessageRole.ASSISTANT) {
|
||||
try {
|
||||
const updateData: { content: string; timings?: ChatMessageTimings } = {
|
||||
content: streamingState.response
|
||||
};
|
||||
const lastKnownState = this.getProcessingState(conversationId);
|
||||
if (lastKnownState) {
|
||||
updateData.timings = {
|
||||
prompt_n: lastKnownState.promptTokens || 0,
|
||||
prompt_ms: lastKnownState.promptMs,
|
||||
predicted_n: lastKnownState.tokensDecoded || 0,
|
||||
cache_n: lastKnownState.cacheTokens || 0,
|
||||
predicted_ms:
|
||||
lastKnownState.tokensPerSecond && lastKnownState.tokensDecoded
|
||||
? (lastKnownState.tokensDecoded / lastKnownState.tokensPerSecond) * 1000
|
||||
: undefined
|
||||
};
|
||||
}
|
||||
await DatabaseService.updateMessage(lastMessage.id, updateData);
|
||||
lastMessage.content = streamingState.response;
|
||||
if (updateData.timings) lastMessage.timings = updateData.timings;
|
||||
} catch (error) {
|
||||
lastMessage.content = streamingState.response;
|
||||
console.error('Failed to save partial response:', error);
|
||||
if (lastMessage?.role !== MessageRole.ASSISTANT) return;
|
||||
|
||||
const partialContent = streamingState.response;
|
||||
const partialReasoning = lastMessage.reasoningContent || '';
|
||||
|
||||
// nothing to persist when both content and reasoning are empty (e.g. stop before any token)
|
||||
if (!partialContent.trim() && !partialReasoning.trim()) return;
|
||||
|
||||
try {
|
||||
const updateData: {
|
||||
content: string;
|
||||
reasoningContent?: string;
|
||||
timings?: ChatMessageTimings;
|
||||
} = {
|
||||
content: partialContent
|
||||
};
|
||||
if (partialReasoning) {
|
||||
updateData.reasoningContent = partialReasoning;
|
||||
}
|
||||
const lastKnownState = this.getProcessingState(conversationId);
|
||||
if (lastKnownState) {
|
||||
updateData.timings = {
|
||||
prompt_n: lastKnownState.promptTokens || 0,
|
||||
prompt_ms: lastKnownState.promptMs,
|
||||
predicted_n: lastKnownState.tokensDecoded || 0,
|
||||
cache_n: lastKnownState.cacheTokens || 0,
|
||||
predicted_ms:
|
||||
lastKnownState.tokensPerSecond && lastKnownState.tokensDecoded
|
||||
? (lastKnownState.tokensDecoded / lastKnownState.tokensPerSecond) * 1000
|
||||
: undefined
|
||||
};
|
||||
}
|
||||
await DatabaseService.updateMessage(lastMessage.id, updateData);
|
||||
lastMessage.content = partialContent;
|
||||
if (updateData.timings) lastMessage.timings = updateData.timings;
|
||||
} catch (error) {
|
||||
lastMessage.content = partialContent;
|
||||
console.error('Failed to save partial response:', error);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1265,7 +1279,11 @@ class ChatStore {
|
||||
const conversationContext = conversationsStore.activeMessages.slice(0, idx);
|
||||
const contextWithContinue = [
|
||||
...conversationContext,
|
||||
{ role: MessageRole.ASSISTANT as const, content: originalContent }
|
||||
{
|
||||
role: MessageRole.ASSISTANT as const,
|
||||
content: originalContent,
|
||||
reasoning_content: originalReasoning || undefined
|
||||
}
|
||||
];
|
||||
|
||||
let appendedContent = '';
|
||||
@@ -1283,6 +1301,7 @@ class ChatStore {
|
||||
contextWithContinue,
|
||||
{
|
||||
...this.getApiOptions(),
|
||||
continueFinalMessage: true,
|
||||
onChunk: (chunk: string) => {
|
||||
appendedContent += chunk;
|
||||
hasReceivedContent = true;
|
||||
@@ -1291,6 +1310,8 @@ class ChatStore {
|
||||
onReasoningChunk: (chunk: string) => {
|
||||
appendedReasoning += chunk;
|
||||
hasReceivedContent = true;
|
||||
// mark streaming state so a stop mid-thinking can persist the partial reasoning
|
||||
this.setChatStreaming(msg.convId, originalContent + appendedContent, msg.id);
|
||||
conversationsStore.updateMessageAtIndex(idx, {
|
||||
reasoningContent: originalReasoning + appendedReasoning
|
||||
});
|
||||
|
||||
3
tools/server/webui/src/lib/types/api.d.ts
vendored
@@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest {
|
||||
// Custom parameters (JSON string)
|
||||
custom?: Record<string, unknown>;
|
||||
timings_per_token?: boolean;
|
||||
// Continuation control (vLLM compat)
|
||||
add_generation_prompt?: boolean;
|
||||
continue_final_message?: boolean;
|
||||
}
|
||||
|
||||
export interface ApiChatCompletionToolCallFunctionDelta {
|
||||
|
||||
@@ -92,6 +92,8 @@ export interface SettingsChatServiceOptions {
|
||||
// Custom parameters
|
||||
custom?: string;
|
||||
timings_per_token?: boolean;
|
||||
// Continuation control (vLLM compat): opt in to the explicit continue_final_message flag
|
||||
continueFinalMessage?: boolean;
|
||||
// Callbacks
|
||||
onChunk?: (chunk: string) => void;
|
||||
onReasoningChunk?: (chunk: string) => void;
|
||||
|
||||