Compare commits

19 Commits
b5392 ... b5411

Author SHA1 Message Date
Jeff Bolz
4f41ee11d6 vulkan: use scalar FA rather than coopmat2 when N==1 (#13554) 2025-05-17 08:35:47 +02:00
Z
3e0be1cace llguidance : official v0.7.20 release (no actual changes) [noci] (#13594) 2025-05-16 22:56:28 +02:00
Xuan-Son Nguyen
6aa892ec2a server : do not return error out of context (with ctx shift disabled) (#13577) 2025-05-16 21:50:00 +02:00
Xuan-Son Nguyen
aea9f8b4e7 webui : improve accessibility for visually impaired people (#13551)
* webui : improve accessibility for visually impaired people

* add a11y for extra contents

* fix some labels being read twice

* add skip to main content
2025-05-16 21:49:01 +02:00
Xuan-Son Nguyen
06c1e4abc1 readme : add list of dependencies and their license (#13591) 2025-05-16 20:04:18 +02:00
Diego Devesa
415e40a357 releases : use arm version of curl for arm releases (#13592) 2025-05-16 19:36:51 +02:00
Georgi Gerganov
654a67794f metal : add FA-vec kernel for head size 64 (#13583)
ggml-ci
2025-05-16 20:32:58 +03:00
Diego Devesa
5364ae4ba5 llama : print hint when loading a model when no backends are loaded (#13589) 2025-05-16 16:38:07 +02:00
Sigbjørn Skjæret
7c07ac244d ci : add ppc64el to build-linux-cross (#13575) 2025-05-16 14:54:23 +02:00
Łukasz Ślusarczyk
0a338ed013 sycl : fixed compilation warnings (#13582) 2025-05-16 18:15:29 +08:00
Olivier Chafik
bc098c3cf0 minja: sync (qwen3) (#13573)
* minja: sync f06140fa52

- https://github.com/google/minja/pull/67 (@grf53)
- https://github.com/google/minja/pull/66 (@taha-yassine)
- https://github.com/google/minja/pull/63 (@grf53)
- https://github.com/google/minja/pull/58

---------

Co-authored-by: ochafik <ochafik@google.com>
2025-05-15 23:29:10 +01:00
Diego Devesa
c6a2c9e741 gguf : use ggml log system (#13571)
* gguf : use ggml log system

* llama : remove unnecessary new lines in exception messages
2025-05-15 19:13:11 +02:00
Daniel Tang
07ad2b6db3 gguf-py : fix disconnect-before-connect in editor-gui (#13569)
The bug caused a crash upon load with venvs created with
--system-site-packages that use
python3-pyside6.qtwidgets=6.6.2-4
from Kubuntu 24.10.
2025-05-15 18:47:10 +02:00
Xuan-Son Nguyen
c531edfa34 convert : fix conversion for llama 4 (#13567) 2025-05-15 17:40:07 +02:00
Atharva Dubey
02cdd2d8b0 sycl: simplify bin_bcast_kernel (#13383) 2025-05-15 17:39:52 +02:00
Svetlozar Georgiev
64bb51cf90 sycl: reordered Q4_K MMVQ (#13109) 2025-05-15 17:35:44 +02:00
Łukasz Ślusarczyk
9c404ed54c sycl: use oneDNN for matrices multiplication (#12972) 2025-05-15 16:53:41 +02:00
Diego Devesa
6c8b91500e llama-bench : fix -ot with dl backends (#13563) 2025-05-15 15:46:55 +02:00
Xuan-Son Nguyen
3cc1f1f1d2 webui : handle PDF input (as text or image) + convert pasted long content to file (#13562)
* webui : handle PDF input (as text or image)

* handle the case where pdf image + server without mtmd

* fix bug missing pages
2025-05-15 14:24:50 +02:00
47 changed files with 1510 additions and 614 deletions

View File

@@ -5,6 +5,10 @@ inputs:
description: 'CURL version'
required: false
default: '8.6.0_6'
architecture:
description: 'Architecture of the libcurl to download'
required: false
default: 'win64'
outputs:
curl_path:
description: "Path to the downloaded libcurl"
@@ -18,8 +22,9 @@ runs:
shell: powershell
env:
CURL_VERSION: ${{ inputs.curl_version }}
ARCHITECTURE: ${{ inputs.architecture }}
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT

View File

@@ -140,3 +140,94 @@ jobs:
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libcurl4-openssl-dev:ppc64el
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-vulkan-cross:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get install -y --no-install-recommends \
build-essential \
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el \
libcurl4-openssl-dev:ppc64el
- name: Build
run: |
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)

View File

@@ -238,14 +238,19 @@ jobs:
matrix:
include:
- build: 'cpu-x64'
arch: 'x64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
#- build: 'openblas-x64'
# arch: 'x64'
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'vulkan-x64'
arch: 'x64'
defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
- build: 'cpu-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
- build: 'opencl-adreno-arm64'
arch: 'arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
steps:
@@ -312,6 +317,8 @@ jobs:
- name: libCURL
id: get_libcurl
uses: ./.github/actions/windows-setup-curl
with:
architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}
- name: Build
id: cmake_build
@@ -339,7 +346,7 @@ jobs:
env:
CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts

View File

@@ -572,4 +572,11 @@ automatically. For example:
$ echo "source ~/.llama-completion.bash" >> ~/.bashrc
```
## References
## Dependencies
- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)

View File

@@ -121,8 +121,8 @@ if (LLAMA_LLGUIDANCE)
ExternalProject_Add(llguidance_ext
GIT_REPOSITORY https://github.com/guidance-ai/llguidance
# v0.7.19 (+ fancy-regex build fix):
GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
# v0.7.20 (+ fix to build on GCC 15):
GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
PREFIX ${CMAKE_BINARY_DIR}/llguidance
SOURCE_DIR ${LLGUIDANCE_SRC}
BUILD_IN_SOURCE TRUE

View File

@@ -13,10 +13,12 @@
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <ctime>
#include <exception>
#include <iomanip>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
@@ -393,8 +395,8 @@ class chat_template {
for (const auto & message_ : adjusted_messages) {
auto message = message_;
if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
}
std::string role = message.at("role");
@@ -415,7 +417,6 @@ class chat_template {
}
}
if (polyfill_tool_calls) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
if (tool_call.at("type") != "function") {
@@ -434,8 +435,11 @@ class chat_template {
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && !content.empty()) {
obj["content"] = content;
if (message.contains("content")) {
auto content = message.at("content");
if (!content.is_null() && !content.empty()) {
obj["content"] = content;
}
}
message["content"] = obj.dump(2);
message.erase("tool_calls");
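
The behavioral change in this hunk is that a message may now omit `content` entirely when it carries `tool_calls`. A minimal standalone sketch of the same check-and-merge logic, using nlohmann/json (the helper name is hypothetical, not part of minja's API):

```cpp
#include <nlohmann/json.hpp>
#include <stdexcept>

using json = nlohmann::json;

// hypothetical helper: validate a chat message and fold tool_calls into the
// content string, the way the polyfill above does
json polyfill_tool_calls_message(json message) {
    if (!message.contains("role") ||
        (!message.contains("content") && !message.contains("tool_calls"))) {
        throw std::runtime_error(
            "message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
    }
    if (message.contains("tool_calls")) {
        json obj = { { "tool_calls", message.at("tool_calls") } };
        if (message.contains("content")) {
            const json & content = message.at("content");
            // only carry content over when it is present and non-empty
            if (!content.is_null() && !content.empty()) {
                obj["content"] = content;
            }
        }
        message["content"] = obj.dump(2);
        message.erase("tool_calls");
    }
    return message;
}
```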

View File

@@ -11,6 +11,7 @@
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <cmath>
#include <exception>
#include <functional>
@@ -233,7 +234,7 @@ public:
}
} else if (is_object()) {
if (!index.is_hashable())
throw std::runtime_error("Unashable type: " + index.dump());
throw std::runtime_error("Unhashable type: " + index.dump());
auto it = object_->find(index.primitive_);
if (it == object_->end())
throw std::runtime_error("Key not found: " + index.dump());
@@ -252,7 +253,7 @@ public:
auto index = key.get<int>();
return array_->at(index < 0 ? array_->size() + index : index);
} else if (object_) {
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
auto it = object_->find(key.primitive_);
if (it == object_->end()) return Value();
return it->second;
@@ -261,7 +262,7 @@ public:
}
void set(const Value& key, const Value& value) {
if (!object_) throw std::runtime_error("Value is not an object: " + dump());
if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
(*object_)[key.primitive_] = value;
}
Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
@@ -398,7 +399,7 @@ public:
}
return false;
} else if (object_) {
if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
if (!value.is_hashable()) throw std::runtime_error("Unhashable type: " + value.dump());
return object_->find(value.primitive_) != object_->end();
} else {
throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
@@ -416,7 +417,7 @@ public:
return const_cast<Value*>(this)->at(index);
}
Value& at(const Value & index) {
if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
if (!index.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
if (is_array()) return array_->at(index.get<int>());
if (is_object()) return object_->at(index.primitive_);
throw std::runtime_error("Value is not an array or object: " + dump());
@@ -676,8 +677,8 @@ public:
class VariableExpr : public Expression {
std::string name;
public:
VariableExpr(const Location & location, const std::string& n)
: Expression(location), name(n) {}
VariableExpr(const Location & loc, const std::string& n)
: Expression(loc), name(n) {}
std::string get_name() const { return name; }
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
if (!context->contains(name)) {
@@ -1200,9 +1201,9 @@ public:
class SliceExpr : public Expression {
public:
std::shared_ptr<Expression> start, end;
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
std::shared_ptr<Expression> start, end, step;
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e, std::shared_ptr<Expression> && st = nullptr)
: Expression(loc), start(std::move(s)), end(std::move(e)), step(std::move(st)) {}
Value do_evaluate(const std::shared_ptr<Context> &) const override {
throw std::runtime_error("SliceExpr not implemented");
}
@@ -1219,18 +1220,35 @@ public:
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
auto target_value = base->evaluate(context);
if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
auto len = target_value.size();
auto wrap = [len](int64_t i) -> int64_t {
if (i < 0) {
return i + len;
}
return i;
};
int64_t step = slice->step ? slice->step->evaluate(context).get<int64_t>() : 1;
if (!step) {
throw std::runtime_error("slice step cannot be zero");
}
int64_t start = slice->start ? wrap(slice->start->evaluate(context).get<int64_t>()) : (step < 0 ? len - 1 : 0);
int64_t end = slice->end ? wrap(slice->end->evaluate(context).get<int64_t>()) : (step < 0 ? -1 : len);
if (target_value.is_string()) {
std::string s = target_value.get<std::string>();
if (start < 0) start = s.size() + start;
if (end < 0) end = s.size() + end;
return s.substr(start, end - start);
std::string result;
if (start < end && step == 1) {
result = s.substr(start, end - start);
} else {
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
result += s[i];
}
}
return result;
} else if (target_value.is_array()) {
if (start < 0) start = target_value.size() + start;
if (end < 0) end = target_value.size() + end;
auto result = Value::array();
for (auto i = start; i < end; ++i) {
for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
result.push_back(target_value.at(i));
}
return result;
@@ -1305,6 +1323,8 @@ public:
if (name == "iterable") return l.is_iterable();
if (name == "sequence") return l.is_array();
if (name == "defined") return !l.is_null();
if (name == "true") return l.to_bool();
if (name == "false") return !l.to_bool();
throw std::runtime_error("Unknown type for 'is' operator: " + name);
};
auto value = eval();
@@ -1520,6 +1540,10 @@ public:
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
auto suffix = vargs.args[0].get<std::string>();
return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
} else if (method->get_name() == "startswith") {
vargs.expectArgs("startswith method", {1, 1}, {0, 0});
auto prefix = vargs.args[0].get<std::string>();
return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
} else if (method->get_name() == "title") {
vargs.expectArgs("title method", {0, 0}, {0, 0});
auto res = str;
@@ -2082,28 +2106,37 @@ private:
while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
if (!consumeToken("[").empty()) {
std::shared_ptr<Expression> index;
std::shared_ptr<Expression> index;
auto slice_loc = get_location();
std::shared_ptr<Expression> start, end, step;
bool has_first_colon = false, has_second_colon = false;
if (!peekSymbols({ ":" })) {
start = parseExpression();
}
if (!consumeToken(":").empty()) {
has_first_colon = true;
if (!peekSymbols({ ":", "]" })) {
end = parseExpression();
}
if (!consumeToken(":").empty()) {
auto slice_end = parseExpression();
index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
} else {
auto slice_start = parseExpression();
if (!consumeToken(":").empty()) {
consumeSpaces();
if (peekSymbols({ "]" })) {
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
} else {
auto slice_end = parseExpression();
index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
}
} else {
index = std::move(slice_start);
has_second_colon = true;
if (!peekSymbols({ "]" })) {
step = parseExpression();
}
}
if (!index) throw std::runtime_error("Empty index in subscript");
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
}
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
if ((has_first_colon || has_second_colon) && (start || end || step)) {
index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
} else {
index = std::move(start);
}
if (!index) throw std::runtime_error("Empty index in subscript");
if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");
value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
} else if (!consumeToken(".").empty()) {
auto identifier = parseIdentifier();
if (!identifier) throw std::runtime_error("Expected identifier in subscript");
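
The new `SliceExpr` evaluation above implements Python-style slicing with an optional step: negative indices wrap once by the length, and the default start/end depend on the sign of the step. A standalone sketch of the same arithmetic on a plain string (not minja code, just the rules the hunk targets):

```cpp
#include <cstdint>
#include <stdexcept>
#include <string>

std::string slice(const std::string & s, int64_t start, int64_t end, int64_t step,
                  bool has_start, bool has_end) {
    const int64_t len = (int64_t) s.size();
    if (step == 0) throw std::runtime_error("slice step cannot be zero");
    auto wrap = [len](int64_t i) { return i < 0 ? i + len : i; };
    // defaults follow the step direction, as in Python
    if (has_start) start = wrap(start); else start = step < 0 ? len - 1 : 0;
    if (has_end)   end   = wrap(end);   else end   = step < 0 ? -1 : len;
    std::string result;
    for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
        result += s[i];
    }
    return result; // the hunk special-cases step == 1 with substr; same result
}
// slice("abcdef", 0, 0, -1, false, false) -> "fedcba" (like Python's s[::-1])
```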

View File

@@ -2069,6 +2069,9 @@ class Llama4Model(LlamaModel):
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.startswith("language_model."):
name = name.replace("language_model.", "")
# split the gate_up into gate and up
if "gate_up_proj" in name:
name_up = name.replace("gate_up_proj", "up_proj.weight")

View File

@@ -731,6 +731,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@@ -741,6 +742,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
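
These runtime variables are plain integer toggles read once at startup (see the `ggml_check_sycl` hunk later in this compare). A sketch of the pattern, with `get_sycl_env` standing in for the helper used in ggml-sycl.cpp:

```cpp
#include <cstdlib>
#include <string>

// stand-in for the helper in ggml-sycl.cpp: read an int env var with a default
static int get_sycl_env(const char * name, int default_val) {
    const char * v = std::getenv(name);
    return v ? std::stoi(std::string(v)) : default_val;
}

// e.g. GGML_SYCL_DISABLE_DNN=1 skips oneDNN and falls back to oneMKL:
// int g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
```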

View File

@@ -193,6 +193,7 @@ option(GGML_RPC "ggml: use RPC"
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING

View File

@@ -415,6 +415,13 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
@@ -1362,6 +1369,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64, flash_attn_ext_vec_f16_h64, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64, flash_attn_ext_vec_bf16_h64, has_simdgroup_reduction && use_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64, flash_attn_ext_vec_q4_0_h64, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64, flash_attn_ext_vec_q4_1_h64, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64, flash_attn_ext_vec_q5_0_h64, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64, flash_attn_ext_vec_q5_1_h64, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64, flash_attn_ext_vec_q8_0_h64, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction);
@@ -4358,7 +4372,7 @@ static bool ggml_metal_encode_node(
// TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
// for now avoiding mainly to keep the number of templates/kernels a bit lower
// these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
switch (src1->type) {
case GGML_TYPE_F16:
{
@@ -4539,6 +4553,24 @@ static bool ggml_metal_encode_node(
use_vec_kernel = true;
switch (ne00) {
case 64:
{
switch (src1->type) {
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline; break;
case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break;
case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break;
case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break;
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break;
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break;
case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break;
default:
{
GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
GGML_LOG_ERROR("add template specialization for this type\n");
GGML_ABORT("add template specialization for this type");
}
}
} break;
case 96:
{
switch (src1->type) {

View File

@@ -4124,6 +4124,16 @@ kernel void kernel_flash_attn_ext_vec(
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;
template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 64, 64, 8>;
#if defined(GGML_METAL_USE_BF16)
template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 64, 64, 8>;
#endif
template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 64, 64, 8>;
template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 64, 64, 8>;
template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 64, 64, 8>;
template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 64, 64, 8>;
template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 64, 64, 8>;
template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 96, 96, 4>;
#if defined(GGML_METAL_USE_BF16)
template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 96, 96, 4>;
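
Combined with the dispatch change in ggml-metal.m above, the effect of these template instantiations is that the FA-vec path now also covers head size 64. An illustrative restatement of the updated selection condition (the helper name is hypothetical):

```cpp
// the FA-vec kernels are used for small batches when the head size has a
// template specialization, which after this change includes 64
static bool use_fa_vec_kernel(int ne01, int ne00) {
    const bool head_supported = (ne00 % 128 == 0) || ne00 == 64 || ne00 == 96 ||
                                ne00 == 192 || ne00 == 576;
    return ne01 < 20 && head_supported;
}
```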

View File

@@ -49,34 +49,38 @@ endif()
target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
# Link against oneDNN
find_package(DNNL)
set(GGML_SYCL_DNNL 0)
if(DNNL_FOUND)
if (NOT DEFINED DNNL_GPU_VENDOR)
# default to intel target
set(DNNL_GPU_VENDOR "INTEL")
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
if(GGML_SYCL_DNN)
find_package(DNNL)
if(DNNL_FOUND)
if (NOT DEFINED DNNL_GPU_VENDOR)
# default to intel target
set(DNNL_GPU_VENDOR "INTEL")
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
endif()
endif()
endif()
# Verify oneDNN was compiled for the same target as llama
if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
set(GGML_SYCL_DNNL 1)
get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
foreach(CONFIG ${CONFIGS})
get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
message(STATUS "Found oneDNN: ${DNNL_LIB}")
endforeach()
# Verify oneDNN was compiled for the same target as llama
if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
set(GGML_SYCL_DNNL 1)
get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
foreach(CONFIG ${CONFIGS})
get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
message(STATUS "Found oneDNN: ${DNNL_LIB}")
endforeach()
else()
message(WARNING
"oneDNN must be compiled for the same target as llama.cpp.
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
Disabling oneDNN support.")
endif()
else()
message(WARNING
"oneDNN must be compiled for the same target as llama.cpp.
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
Disabling oneDNN support.")
message(STATUS "oneDNN not found, disabling oneDNN support")
endif()
else()
message(STATUS "oneDNN not found, disabling oneDNN support")
message(STATUS "oneDNN support disabled by the user")
endif()
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})

View File

@@ -1,93 +1,74 @@
#include "binbcast.hpp"
#include <array>
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>
#include "dpct/helper.hpp"
#include "ggml.h"
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1));
const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) /
ne3;
const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
item_ct1.get_local_id(0)) %
ne3;
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
for (int i0 = i0s; i0 < ne0;
i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
auto element_id = it.get_global_id(0);
auto global_range = it.get_global_range(0);
for (; element_id < num_elements; element_id += global_range) {
auto src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
auto src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
dst[element_id] = val_to_store;
}
}
template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3,
int ne10, int ne11, int ne12, int ne13,
/*int s0, */ int s1, int s2, int s3,
/*int s00,*/ int s01, int s02, int s03,
/*int s10,*/ int s11, int s12, int s13,
const sycl::nd_item<3> &item_ct1) {
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
int s11, int s12, int s13, std::size_t num_dst_elements,
const sycl::nd_item<1> & item_ct1) {
auto calculate_logical_index =
[](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
std::array<int, 4> logical_index;
#pragma unroll(4)
for (int i = 3; i >= 0; i--) {
logical_index[i] = element_id % dims[i];
element_id /= dims[i];
}
return logical_index;
};
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
const std::array<int, 4> & indices) __attribute__((always_inline))
->std::size_t {
std::size_t index = 0;
#pragma unroll(4)
for (int i = 0; i < 4; i++) {
auto index_i = indices[i];
if (indices[i] >= dims[i]) {
index_i = indices[i] % dims[i];
}
index += strides[i] * index_i;
}
return index;
};
const int i3 = i/(ne2*ne1*ne0);
const int i2 = (i/(ne1*ne0)) % ne2;
const int i1 = (i/ne0) % ne1;
const int i0 = i % ne0;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
auto element_id = item_ct1.get_global_id(0);
for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
auto logical_index = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
auto src_0_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
auto src_1_index = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
auto dst_index = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
auto src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
auto src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
float dst_val = bin_op(src0_float_val[0], src1_float_val[0]);
auto val_to_store = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
dst[dst_index] = val_to_store;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
const src0_t * src0_row = src0 + i_src0;
const src1_t * src1_row = src1 + i_src1;
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
}
template<float (*bin_op)(const float, const float)>
struct bin_bcast_sycl {
template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
template <typename src0_t, typename src1_t, typename dst_t>
void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
@@ -96,165 +77,73 @@ struct bin_bcast_sycl {
const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
int nr0 = ne10 / ne0;
int nr1 = ne11/ne1;
int nr2 = ne12/ne2;
int nr3 = ne13/ne3;
int nr[4] = { nr0, nr1, nr2, nr3 };
// collapse dimensions until first broadcast dimension
int64_t cne[] = {ne0, ne1, ne2, ne3};
int64_t cne0[] = {ne00, ne01, ne02, ne03};
int64_t cne1[] = {ne10, ne11, ne12, ne13};
size_t cnb[] = {nb0, nb1, nb2, nb3};
size_t cnb0[] = {nb00, nb01, nb02, nb03};
size_t cnb1[] = {nb10, nb11, nb12, nb13};
auto collapse = [](int64_t cne[]) {
cne[0] *= cne[1];
cne[1] = cne[2];
cne[2] = cne[3];
cne[3] = 1;
};
auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
cnb[1] *= cne[1];
cnb[2] *= cne[2];
cnb[3] *= cne[3];
};
if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
const std::array<int64_t, 4> & dst_dims) -> bool {
for (int i = 0; i < 4; i++) {
if (nr[i] != 1) {
break;
}
if (i > 0) {
collapse_nb(cnb, cne);
collapse_nb(cnb0, cne0);
collapse_nb(cnb1, cne1);
collapse(cne);
collapse(cne0);
collapse(cne1);
if (dst_dims[i] > src_dims[i]) {
return true;
}
}
}
{
int64_t ne0 = cne[0];
int64_t ne1 = cne[1];
int64_t ne2 = cne[2];
int64_t ne3 = cne[3];
return false;
};
int64_t ne10 = cne1[0];
int64_t ne11 = cne1[1];
int64_t ne12 = cne1[2];
int64_t ne13 = cne1[3];
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
size_t nb0 = cnb[0];
size_t nb1 = cnb[1];
size_t nb2 = cnb[2];
size_t nb3 = cnb[3];
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
size_t nb00 = cnb0[0];
size_t nb01 = cnb0[1];
size_t nb02 = cnb0[2];
size_t nb03 = cnb0[3];
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
size_t nb10 = cnb1[0];
size_t nb11 = cnb1[1];
size_t nb12 = cnb1[2];
size_t nb13 = cnb1[3];
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
// dst strides in number of elements
size_t s0 = nb0 / sizeof(dst_t);
size_t s1 = nb1 / sizeof(dst_t);
size_t s2 = nb2 / sizeof(dst_t);
size_t s3 = nb3 / sizeof(dst_t);
size_t s10 = nb10 / sizeof(src1_t);
size_t s11 = nb11 / sizeof(src1_t);
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);
// src1 strides in number of elements
size_t s10 = nb10 / sizeof(src0_t);
size_t s11 = nb11 / sizeof(src1_t);
size_t s12 = nb12 / sizeof(src1_t);
size_t s13 = nb13 / sizeof(src1_t);
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
// src0 strides in number of elements
size_t s00 = nb00 / sizeof(src0_t);
size_t s01 = nb01 / sizeof(src0_t);
size_t s02 = nb02 / sizeof(src0_t);
size_t s03 = nb03 / sizeof(src0_t);
GGML_UNUSED(s00);
std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
std::size_t local_range = 256;
std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;
GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;
GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
GGML_ASSERT(s0 == 1);
GGML_ASSERT(s10 == 1);
const int block_size = 128;
int64_t hne0 = std::max(ne0/2LL, 1LL);
sycl::range<3> block_dims(1, 1, 1);
block_dims[2] = std::min<unsigned int>(hne0, block_size);
block_dims[1] = std::min<unsigned int>(
ne1, block_size / (unsigned int)block_dims[2]);
block_dims[0] = std::min(
std::min<unsigned int>(
ne2 * ne3, block_size / (unsigned int)block_dims[2] /
(unsigned int)block_dims[1]),
64U);
sycl::range<3> block_nums(
(ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
(ne1 + block_dims[1] - 1) / block_dims[1],
(hne0 + block_dims[2] - 1) / block_dims[2]);
if (block_nums[0] > 65535) {
// this is the maximum number of blocks in z direction, fallback to 1D grid kernel
int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
sycl::range<3>(1, 1, block_size),
sycl::range<3>(1, 1, block_size)),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast_unravel<bin_op>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
s03, s11, s12, s13, item_ct1);
});
}
} else {
/*
DPCT1049:16: The work-group size passed to the SYCL kernel may
exceed the limit. To get the device limit, query
info::device::max_work_group_size. Adjust the work-group size if
needed.
*/
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
ne2, ne3, ne10, ne11, ne12, ne13,
s1, s2, s3, s01, s02, s03, s11, s12, s13,
item_ct1);
});
}
if (! needs_broadcasting && all_contiguous) {
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
});
});
} else {
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
});
});
}
}
};
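
The rewrite replaces the 3D work-group arithmetic with a flat 1D range plus explicit index math. A minimal host-side sketch of that math, assuming dims ordered {ne3, ne2, ne1, ne0} as in the kernel's lambdas:

```cpp
#include <array>
#include <cstddef>

// unravel a flat element id into a 4D logical index, fastest-varying dim last
static std::array<int, 4> unravel(const std::array<int, 4> & dims, std::size_t id) {
    std::array<int, 4> idx{};
    for (int i = 3; i >= 0; i--) {
        idx[i] = (int) (id % dims[i]);
        id /= dims[i];
    }
    return idx;
}

// map a logical index through an operand's strides; taking each coordinate
// modulo the operand's own dims makes broadcast dimensions wrap around,
// equivalent to the conditional modulo in the kernel
static std::size_t offset(const std::array<int, 4> & dims,
                          const std::array<int, 4> & strides,
                          const std::array<int, 4> & idx) {
    std::size_t off = 0;
    for (int i = 0; i < 4; i++) {
        off += (std::size_t) strides[i] * (idx[i] % dims[i]);
    }
    return off;
}
```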

View File

@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
}
}
template <typename dst_t>
static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t nb = k / QK_K;
const size_t local_size = 32;
const size_t global_size = nb * local_size;
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
stream->submit([&](sycl::handler & cgh) {
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
[=](sycl::nd_item<1> item_ct1) {
dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
});
});
}
template <typename dst_t>
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
@@ -504,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
case GGML_TYPE_Q4_K:
return dequantize_row_q4_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q4_K_sycl_reorder;
} else {
return dequantize_row_q4_K_sycl;
}
case GGML_TYPE_Q5_K:
return dequantize_row_q5_K_sycl;
case GGML_TYPE_Q6_K:
@@ -556,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_Q3_K:
return dequantize_row_q3_K_sycl;
case GGML_TYPE_Q4_K:
return dequantize_row_q4_K_sycl;
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q4_K_sycl_reorder;
} else {
return dequantize_row_q4_K_sycl;
}
case GGML_TYPE_Q5_K:
return dequantize_row_q5_K_sycl;
case GGML_TYPE_Q6_K:

View File

@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
}
#endif
template <typename dst_t>
inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
const int is = 2 * il;
constexpr int n = 4;
uint8_t sc, m;
get_scale_min_k4(is + 0, scales_local, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, scales_local, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
for (int l = 0; l < n; ++l) {
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
}
}
template<typename dst_t>
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
const int64_t i = item_ct1.get_group(2);
#if QK_K == 256
// assume 32 threads
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t is = 2*il;
const int64_t n = 4;
const int64_t il = tid / 8;
const int64_t ir = tid % 8;
dst_t * y = yy + i*QK_K + 64*il + n*ir;
dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
const sycl::half2 dm = x[i].dm;
const float dall = dm[0];
const float dmin = dm[1];
if (tid < 12)
if (tid < 12) {
scales_local[tid] = x[i].scales[tid];
item_ct1.barrier(sycl::access::fence_space::local_space);
uint8_t sc, m;
get_scale_min_k4(is + 0, scales_local, sc, m);
const float d1 = dall * sc;
const float m1 = dmin * m;
get_scale_min_k4(is + 1, scales_local, sc, m);
const float d2 = dall * sc;
const float m2 = dmin * m;
sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
for (int l = 0; l < n; ++l) {
y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
y[l +32] = d2 * (q_vec[l] >> 4) - m2;
}
item_ct1.barrier(sycl::access::fence_space::local_space);
dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
#else
const int64_t tid = item_ct1.get_local_id(2);
const uint8_t * q = x[i].qs;
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
#endif
}
template <typename dst_t>
static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
const sycl::nd_item<1> & item_ct1, int64_t nb) {
const int64_t i = item_ct1.get_group(0); // block index
const int64_t tid = item_ct1.get_local_id(0); // thread index within block
const int64_t il = tid / 8;
const int64_t ir = tid % 8;
dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
const uint8_t * base = static_cast<const uint8_t *>(vx);
const size_t qs_offset = i * (QK_K / 2);
const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
const uint8_t * qs_ptr = base + qs_offset;
const uint8_t * scales_ptr = base + scales_offset;
ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
const float dall = dm_values.x();
const float dmin = dm_values.y();
if (tid < 12) {
scales_local[tid] = scales_ptr[tid];
}
item_ct1.barrier(sycl::access::fence_space::local_space);
dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
}
template<typename dst_t>
static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
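
The reorder path above assumes a repacked q4_K buffer: all quantized nibbles first, then all 12-byte scale blocks, then all packed (d, dmin) half2 pairs. A small sketch of that layout's offset arithmetic (names are illustrative; the constants mirror QK_K and K_SCALE_SIZE):

```cpp
#include <cstddef>

struct q4_K_reorder_offsets {
    std::size_t qs;     // quantized nibbles for super-block i
    std::size_t scales; // 12-byte scale/min block for super-block i
    std::size_t dm;     // packed (d, dmin) half2 for super-block i
};

// nb is the number of QK_K super-blocks in the tensor
static q4_K_reorder_offsets q4_K_reorder_layout(std::size_t i, std::size_t nb,
                                                std::size_t qk_k,          // QK_K
                                                std::size_t k_scale_size,  // K_SCALE_SIZE
                                                std::size_t sizeof_half2) {
    return {
        i * (qk_k / 2),                                      // nibble section
        nb * (qk_k / 2) + i * k_scale_size,                  // after all nibbles
        nb * (qk_k / 2) + nb * k_scale_size + i * sizeof_half2, // after all scales
    };
}
```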

View File

@@ -1129,7 +1129,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_K:
dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
// reorder is currently not supported for dmmv
GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
} else {
dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q5_K:
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);

View File

@@ -655,7 +655,6 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -688,7 +687,6 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -722,7 +720,6 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -754,7 +751,6 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -786,7 +782,6 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -818,7 +813,6 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -850,7 +844,6 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -883,7 +876,6 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -917,7 +909,6 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -949,7 +940,6 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -981,7 +971,6 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1013,7 +1002,6 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1045,7 +1033,6 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1078,7 +1065,6 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1110,7 +1096,6 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1142,7 +1127,6 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1174,7 +1158,6 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1206,7 +1189,6 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1241,7 +1223,6 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1273,7 +1254,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1315,7 +1295,6 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1350,7 +1329,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}
@@ -1388,7 +1366,6 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
break;
}
}

View File

@@ -32,16 +32,36 @@ public:
else static_assert(0);
}
static inline void row_gemm(ggml_backend_sycl_context & ctx, bool a_trans, bool b_trans, int m, int n, int k,
const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
// matrix A has m rows, k columns
// matrix B has k rows, n columns
// nra - number of elements to skip when moving into next row in A
// nrb - number of elements to skip when moving into next row in B
// nca - number of elements to skip when moving into next column in A
// ncb - number of elements to skip when moving into next column in B
// stride_a - number of elements to skip when moving to next A matrix
// stride_b - number of elements to skip when moving to next B matrix
// batches_a - number of A matrices
// batches_b - number of B matrices
static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {
auto stream = ctx.stream_dnnl(q);
auto eng = ctx.engine_dnnl(q);
dnnl::memory::dims a_dims = { m, k };
dnnl::memory::dims b_dims = { k, n };
dnnl::memory::dims c_dims = { m, n };
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
// { # strides, # rows, # columns }
dnnl::memory::dims a_dims = { batches_a, m, k };
dnnl::memory::dims b_dims = { batches_b, k, n };
dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };
// { # elements to skip to next stride, # elements to skip to next row, # elements to skip to next column }
dnnl::memory::dims a_strides = { stride_a, nra, nca };
dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::abc);
dnnl::primitive_attr primitive_attr;
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
@@ -63,6 +83,15 @@ public:
matmul_prim.execute(stream, matmul_args);
}
// matrices A and B are column major, both having k rows
// matrix A has m column, matrix B has n columns
// output: column major matrix C = A transposed * B
static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
}
};
#endif
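
The new `row_gemm` is a thin wrapper whose stride arguments follow directly from the column-major layout described in the comments. A standalone sketch of the derivation (not DNNL API, just the arithmetic):

```cpp
#include <cstdint>

// A and B are column-major with k rows; C = A^T * B. Element (i, j) of A^T
// lives at i*k + j in A's storage, so in gemm's terms:
struct gemm_strides { int64_t next_row, next_col, next_batch; };

static gemm_strides a_transposed_strides(int64_t m, int64_t k) {
    return { /*next_row=*/k, /*next_col=*/1, /*next_batch=*/k * m };
}

static gemm_strides b_strides(int64_t n, int64_t k) {
    // element (r, c) of column-major B lives at c*k + r
    return { /*next_row=*/1, /*next_col=*/k, /*next_batch=*/n * k };
}

// which is exactly the call made by row_gemm:
// gemm(ctx, m, n, k, a, at, k, 1, k*m, b, bt, 1, k, n*k, c, ct, q, 1, 1);
```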

View File

@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;
static ggml_sycl_device_info ggml_sycl_init() {
@@ -196,12 +197,22 @@ static void ggml_check_sycl() try {
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Running with Environment Variables:\n");
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
#ifdef GGML_SYCL_GRAPH
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
#else
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
#endif
#if GGML_SYCL_DNNL
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
#else
GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
#endif
GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
GGML_LOG_INFO("Build with Macros:\n");
#if defined(GGML_SYCL_FORCE_MMQ)
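get_sycl_env is defined elsewhere in the backend; a minimal sketch of such an integer environment-variable reader (assumed semantics: return default_val when the variable is unset) might look like:
#include <cstdlib>
#include <string>
// Sketch only; the real helper lives elsewhere in ggml-sycl.
static int get_int_env(const char * name, int default_val) {
    const char * s = std::getenv(name);
    if (s == nullptr) {
        return default_val;
    }
    return std::stoi(std::string(s)); // throws on non-numeric input
}
With this in place, launching with GGML_SYCL_DISABLE_DNN=1 routes matrix multiplication back to the oneMath/MKL path even in oneDNN-enabled builds.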
@@ -341,7 +352,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
assert(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
}
if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
tensor->extra = extra;
ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -1985,19 +1996,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
GGML_ASSERT(ne00 == ne10);
const int64_t row_diff = row_high - row_low;
int id;
SYCL_CHECK(
CHECK_TRY_ERROR(id = get_current_device_id()));
#if !GGML_SYCL_DNNL
const int64_t ne0 = dst->ne[0];
const int64_t ne0 = dst->ne[0]; // used by MKL only
// the main device has a larger memory buffer to hold the results from all GPUs
// ldc == nrows of the matrix that cuBLAS writes into
int ldc = id == ctx.device ? ne0 : row_diff;
#endif
int ldc = id == ctx.device ? ne0 : row_diff; // used by MKL only
#ifdef GGML_SYCL_F16
bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -2033,25 +2043,29 @@ inline void ggml_sycl_op_mul_mat_sycl(
: src1_as_f16.get();
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
#if !GGML_SYCL_DNNL
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc,
dpct::library_data_t::real_half)));
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
#else
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
#if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) {
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
}
else
#endif
{
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc,
dpct::library_data_t::real_half)));
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
}
}
else {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
@@ -2072,18 +2086,22 @@ inline void ggml_sycl_op_mul_mat_sycl(
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
#if !GGML_SYCL_DNNL
const float alpha = 1.0f;
const float beta = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
#else
DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
#if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) {
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
}
else
#endif
{
const float alpha = 1.0f;
const float beta = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
}
}
GGML_UNUSED(dst);
GGML_UNUSED(src1_ddq_i);
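Both precisions now share the same dispatch idiom: the oneDNN branch exists only in GGML_SYCL_DNNL builds, and within those builds the run-time flag picks the path; placing the else before the #endif makes non-DNNL builds fall through to the oneMath block unconditionally. A stripped-down sketch of the shape:
// Shape of the compile-time + run-time dispatch used above (illustrative).
extern int g_ggml_sycl_disable_dnn;
void mul_mat_dispatch_sketch() {
#if GGML_SYCL_DNNL
    if (!g_ggml_sycl_disable_dnn) {
        // oneDNN path
    }
    else
#endif
    {
        // oneMath/MKL fallback: taken when DNNL is compiled out,
        // or when GGML_SYCL_DISABLE_DNN=1 at run time
    }
}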
@@ -2697,7 +2715,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, void * dst,
const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
@@ -2713,7 +2731,7 @@ static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::h
const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
uint8_t * dst_bytes = reinterpret_cast<uint8_t *>(dst);
uint8_t * dst_bytes = static_cast<uint8_t *>(dst);
ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
@@ -2726,6 +2744,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
@@ -2766,7 +2785,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
}
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
char * dst_t = reinterpret_cast<char *>(dst_ddf);
dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
@@ -2783,42 +2801,83 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
GGML_ASSERT(ne01 == static_cast<int64_t>(nb1/nb0));
GGML_ASSERT(ne10 == ne00);
// broadcast factors
const int64_t r2 = ne12 / ne02;
const int64_t r3 = ne13 / ne03;
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
} else {
const int ne23 = ne12 * ne13;
#if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) {
auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
(const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {
DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
};
sycl::range<3> block_dims(1, ne12, ne13);
queue->submit([&](sycl::handler & cgh) {
const void ** ptrs_src_get = ptrs_src.get();
void ** ptrs_dst_get = ptrs_dst.get();
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
if (r2 == 1 && r3 == 1) {
if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
}
else {
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
}
}
} else {
// iterate over batches from smaller set of matrices (matrix 0)
for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
}
}
}
}
else
#endif
{
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
} else {
const int ne23 = ne12 * ne13;
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
sycl::range<3> block_dims(1, ne12, ne13);
queue->submit([&](sycl::handler & cgh) {
const void ** ptrs_src_get = ptrs_src.get();
void ** ptrs_dst_get = ptrs_dst.get();
size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
});
});
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
}
}
} catch (const sycl::exception & exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
@@ -2841,6 +2900,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return true;
case GGML_TYPE_Q4_K:
return !g_ggml_sycl_prioritize_dmmv;
default:
return false;
}
@@ -2858,6 +2919,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_K:
return true;
default:
return false;
@@ -2883,16 +2945,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
}
}
static void reorder_qw(char *data_device, const int ncols, const int nrows,
size_t size, size_t offset, dpct::queue_ptr stream) {
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
dpct::queue_ptr stream) {
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(
CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
.wait()));
GGML_ASSERT((size % sizeof(block_q4_0) == 0));
GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
int offset_blks = offset / sizeof(block_q4_0);
auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
stream->parallel_for(
@@ -2906,18 +2968,59 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
*(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
}
*(d_ptr + ib) = x[ib].d;
});
}).wait_and_throw();
sycl::free(tmp_buf, *stream);
}
static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q4_K) == 0);
GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
const int nblocks = size / sizeof(block_q4_K);
auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
auto * qs_ptr = data_device;
auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
stream->parallel_for(nblocks, [=](auto i) {
const block_q4_K * x = (const block_q4_K *) tmp_buf;
const int ib = i;
for (int j = 0; j < QK_K / 2; ++j) {
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
}
for (int j = 0; j < K_SCALE_SIZE; ++j) {
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
}
dm_ptr[ib] = x[ib].dm;
}).wait_and_throw();
sycl::free(tmp_buf, *stream);
}
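The reordered Q4_K buffer produced above is a struct-of-arrays layout: all quants first (QK_K/2 bytes per block), then all scale blobs (K_SCALE_SIZE bytes per block), then all half2 (d, dmin) pairs. A standalone sketch of the offset arithmetic, using the standard ggml constants QK_K = 256 and K_SCALE_SIZE = 12:
#include <cstdio>
// Offset arithmetic for the reordered buffer written by reorder_qw_q4_k():
// [ qs | scales | dm ], each region contiguous across all blocks.
int main() {
    const int QK_K         = 256;
    const int K_SCALE_SIZE = 12;
    const int nblocks      = 1024; // example tensor size in blocks
    const size_t qs_bytes     = (size_t) nblocks * QK_K / 2;     // 4-bit quants
    const size_t scales_bytes = (size_t) nblocks * K_SCALE_SIZE; // packed 6-bit scales/mins
    const size_t dm_offset    = qs_bytes + scales_bytes;         // start of (d, dmin) pairs
    std::printf("qs region:     [0, %zu)\n", qs_bytes);
    std::printf("scales region: [%zu, %zu)\n", qs_bytes, dm_offset);
    std::printf("dm region starts at byte %zu\n", dm_offset);
    return 0;
}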
static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
char*data_device = (char*)src0->data;
uint8_t * data_device = (uint8_t *) src0->data;
size_t ncols = src0->ne[0];
size_t nrows = src0->ne[1];
size_t size = ggml_nbytes(src0);
reorder_qw(data_device, ncols, nrows, size, 0, stream);
switch (src0->type) {
case GGML_TYPE_Q4_0:
reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
break;
case GGML_TYPE_Q4_K:
reorder_qw_q4_k(data_device, size, 0, stream);
break;
default:
GGML_ABORT("reorder_qw() called with unsupported type");
break;
}
}
static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
@@ -2960,8 +3063,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
}
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
}
static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
}
static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;
@@ -2984,13 +3097,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
}
// check data types and tensor shapes for custom matrix multiplication kernels:
bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);
bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
@@ -3713,7 +3822,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
return GGML_STATUS_SUCCESS;
}
sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()), {sycl_ex::property::graph::assume_buffer_outlives_graph{}});
model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
model_sycl_graph.end_recording();

View File

@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
const int blocks_per_row = ncols / block_traits::qk;
constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
const int nblocks = nrows * (ncols / block_traits::qk);
static_assert(blocks_per_subgroup > 0);
static_assert(block_elements_per_subgroup > 0);
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
// x block quant index when casting the quants to int
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
}
}
@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
}
}
static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
constexpr size_t num_subgroups = 16;
GGML_ASSERT(block_num_y % num_subgroups == 0);
const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
[=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
nrows, nd_item);
});
});
}
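A note on the launch geometry: the kernel issues one WARP_SIZE-wide subgroup per block of GGML_SYCL_MMV_Y rows, grouped 16 subgroups to a workgroup, hence the divisibility assert. A sketch of the sizing arithmetic (the WARP_SIZE and GGML_SYCL_MMV_Y values below are illustrative assumptions, not taken from the patch):
#include <cstdio>
static int ceil_div(int a, int b) { return (a + b - 1) / b; }
int main() {
    const int WARP_SIZE       = 32; // assumed
    const int GGML_SYCL_MMV_Y = 1;  // assumed
    const int num_subgroups   = 16;
    const int nrows           = 4096;
    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); // one subgroup per row block
    // global: (1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE)
    // local:  (1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE)
    std::printf("subgroups total: %d, workgroups: %d\n",
                block_num_y, block_num_y / num_subgroups);
    return 0;
}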
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
float *dst, const int ncols,
const int nrows,
@@ -1035,7 +1057,12 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
} else {
mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);

View File

@@ -56,6 +56,28 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};
template <> struct block_q_t<GGML_TYPE_Q4_K> {
struct traits {
static constexpr uint32_t qk = QK_K;
static constexpr uint32_t qi = QI4_K;
static constexpr uint32_t qr = QR4_K;
static constexpr uint32_t vdr_mmvq = 2;
};
static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
auto nblocks = (nrows * (ncols / traits::qk));
return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
}
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
};
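As a worked example of get_d_offset: for a 4096 x 4096 Q4_K tensor, nblocks = 4096 * (4096 / 256) = 65536, so the offset for block_index 0 is 65536 * 128 + 65536 * 12 = 9175040 bytes; the quants occupy the first 8 MiB, the packed scales the next 768 KiB, and the half2 (d, dmin) pairs follow.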
} // namespace ggml_sycl_reordered
#endif // GGML_SYCL_QUANTS_HPP

View File

@@ -285,7 +285,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
}
__dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
int v[q4_0_traits::vdr_mmvq];
@@ -303,6 +303,67 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
};
};
static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
const int & iqs) {
int v[2];
int u[2 * QR4_K];
float d8[QR4_K];
v[0] = q4[0];
v[1] = q4[4];
uint16_t aux[2];
const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
if (j < 2) {
aux[0] = scales[j + 0] & 0x3f3f;
aux[1] = scales[j + 2] & 0x3f3f;
} else {
aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
}
const uint8_t * sc = (const uint8_t *) aux;
const uint8_t * m = sc + 2;
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
for (int i = 0; i < QR4_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds[0];
const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
u[2 * i + 0] = q8[0];
u[2 * i + 1] = q8[4];
}
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
}
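The aux masks above implement Q4_K's packed scale format: the 12-byte scales array encodes eight 6-bit scales and eight 6-bit mins. For the first half of the block (j < 2) the values sit directly in the low six bits of each byte, hence the 0x3f3f masks applied to uint16 pairs; for the second half the low four bits live in the last four bytes and the top two bits are recovered from the high bits of the earlier bytes, hence the 0x0f0f and 0xc0c0 masks with shifts.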
template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
static constexpr ggml_type gtype = GGML_TYPE_Q4_K;
using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
using q4_k_traits = typename q4_k_block::traits;
float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
const int ib = ibx_offset / (QK_K / 2);
const uint8_t * base = static_cast<const uint8_t *>(vbq);
const uint8_t * qs = base + ibx_offset;
const int total_qs_bytes = nblocks * (QK_K / 2);
const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
const uint16_t * scales = (const uint16_t *) scs;
return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
}
};
#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ 4
@@ -649,52 +710,17 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}
static __dpct_inline__ float
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
const int & iqs) {
#ifndef GGML_QKK_64
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
int v[2];
int u[2*QR4_K];
float d8[QR4_K];
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
const int * q4 = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
const uint16_t * scales = (const uint16_t *) bq4_K->scales;
// iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
// iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
v[0] = q4[0];
v[1] = q4[4];
const uint16_t * scales = (const uint16_t *)bq4_K->scales;
uint16_t aux[2];
const int j = bq8_offset/2;
if (j < 2) {
aux[0] = scales[j+0] & 0x3f3f;
aux[1] = scales[j+2] & 0x3f3f;
} else {
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
}
const uint8_t * sc = (const uint8_t *)aux;
const uint8_t * m = sc + 2;
for (int i = 0; i < QR4_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds[0];
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0];
u[2*i+1] = q8[4];
}
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);
#else

View File

@@ -5872,10 +5872,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
vk_pipeline *pipelines;
bool small_rows = N <= get_fa_num_small_rows(path);
// coopmat1 does not actually support "small rows" (it needs 16 rows).
// So use scalar instead.
if (small_rows && path == FA_COOPMAT1) {
path = FA_SCALAR;
}
// scalar is faster than coopmat2 when N==1
if (N == 1 && path == FA_COOPMAT2) {
path = FA_SCALAR;
}
bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
switch (path) {

View File

@@ -299,10 +299,10 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
return false;
}
} catch (std::length_error &) {
fprintf(stderr, "%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
GGML_LOG_ERROR("%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
return false;
} catch (std::bad_alloc &) {
fprintf(stderr, "%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
GGML_LOG_ERROR("%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
return false;
}
kv.emplace_back(key, value);
@@ -328,14 +328,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
ok = ok && gr.read(magic, 4);
if (!ok) {
fprintf(stderr, "%s: failed to read magic\n", __func__);
GGML_LOG_ERROR("%s: failed to read magic\n", __func__);
gguf_free(ctx);
return nullptr;
}
for (uint32_t i = 0; i < magic.size(); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
gguf_free(ctx);
return nullptr;
}
@@ -348,11 +348,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
if (ok && gr.read(ctx->version)) {
if (ctx->version == 1) {
fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
ok = false;
}
if (ctx->version > GGUF_VERSION) {
fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
__func__, ctx->version, GGUF_VERSION);
ok = false;
}
@@ -363,7 +363,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
if (ok && gr.read(n_tensors)) {
static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
GGML_LOG_ERROR("%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
__func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
ok = false;
}
@@ -374,7 +374,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
if (ok && gr.read(n_kv)) {
static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
GGML_LOG_ERROR("%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
__func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
ok = false;
}
@@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
}
if (!ok) {
fprintf(stderr, "%s: failed to read header\n", __func__);
GGML_LOG_ERROR("%s: failed to read header\n", __func__);
gguf_free(ctx);
return nullptr;
}
@@ -399,15 +399,15 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
try {
ok = ok && gr.read(key);
} catch (std::length_error &) {
fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
GGML_LOG_ERROR("%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
ok = false;
} catch (std::bad_alloc &) {
fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
GGML_LOG_ERROR("%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
ok = false;
}
for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
if (key == ctx->kv[j].key) {
fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
GGML_LOG_ERROR("%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
ok = false;
}
}
@@ -441,14 +441,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
case GGUF_TYPE_ARRAY:
default:
{
fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
GGML_LOG_ERROR("%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
ok = false;
} break;
}
}
if (!ok) {
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
GGML_LOG_ERROR("%s: failed to read key-value pairs\n", __func__);
gguf_free(ctx);
return nullptr;
}
@@ -458,7 +458,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);
if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
GGML_LOG_ERROR("%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
gguf_free(ctx);
return nullptr;
}
@@ -474,14 +474,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
try {
ok = ok && gr.read(name);
} catch (std::length_error &) {
fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
GGML_LOG_ERROR("%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
ok = false;
} catch (std::bad_alloc &) {
fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
GGML_LOG_ERROR("%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
ok = false;
}
if (name.length() >= GGML_MAX_NAME) {
fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
GGML_LOG_ERROR("%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
ok = false;
break;
}
@@ -490,7 +490,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// make sure there are no duplicate tensor names
for (int64_t j = 0; ok && j < i; ++j) {
if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
GGML_LOG_ERROR("%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
ok = false;
break;
}
@@ -505,7 +505,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
uint32_t n_dims = -1;
ok = ok && gr.read(n_dims);
if (n_dims > GGML_MAX_DIMS) {
fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
GGML_LOG_ERROR("%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
__func__, info.t.name, n_dims, GGML_MAX_DIMS);
ok = false;
break;
@@ -518,7 +518,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// check that all ne are non-negative
if (info.t.ne[j] < 0) {
fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
GGML_LOG_ERROR("%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
__func__, info.t.name, j, info.t.ne[j]);
ok = false;
break;
@@ -530,7 +530,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
(INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
(INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {
fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
GGML_LOG_ERROR("%s: total number of elements in tensor '%s' with shape "
"(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
__func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
ok = false;
@@ -547,7 +547,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// check that tensor type is within defined range
if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d (%s)\n",
__func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
ok = false;
break;
@@ -557,7 +557,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// check that row size is divisible by block size
if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
GGML_LOG_ERROR("%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
"not a multiple of block size (%" PRId64 ")\n",
__func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
ok = false;
@@ -582,7 +582,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
}
if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
GGML_LOG_ERROR("%s: failed to read tensor info\n", __func__);
gguf_free(ctx);
return nullptr;
}
@@ -590,7 +590,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
// we require the data section to be aligned, so take into account any padding
if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
gguf_free(ctx);
return nullptr;
}
@@ -604,9 +604,9 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
for (size_t i = 0; i < ctx->info.size(); ++i) {
const gguf_tensor_info & ti = ctx->info[i];
if (ti.offset != ctx->size) {
fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
GGML_LOG_ERROR("%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
__func__, ti.t.name, ti.offset, ctx->size);
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
GGML_LOG_ERROR("%s: failed to read tensor data\n", __func__);
gguf_free(ctx);
return nullptr;
}
@@ -634,7 +634,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
*params.ctx = ggml_init(pdata);
if (*params.ctx == nullptr) {
fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
GGML_LOG_ERROR("%s: failed to initialize ggml context for storing tensors\n", __func__);
gguf_free(ctx);
return nullptr;
}
@@ -656,7 +656,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
ok = ok && gr.read(data->data, ctx->size);
if (!ok) {
fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
GGML_LOG_ERROR("%s: failed to read tensor data binary blob\n", __func__);
ggml_free(ctx_data);
*params.ctx = nullptr;
gguf_free(ctx);
@@ -689,7 +689,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
}
if (!ok) {
fprintf(stderr, "%s: failed to create tensors\n", __func__);
GGML_LOG_ERROR("%s: failed to create tensors\n", __func__);
ggml_free(ctx_data);
*params.ctx = nullptr;
gguf_free(ctx);
@@ -706,7 +706,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
GGML_LOG_ERROR("%s: failed to open GGUF file '%s'\n", __func__, fname);
return nullptr;
}
@@ -1305,7 +1305,7 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
FILE * file = ggml_fopen(fname, "wb");
if (!file) {
fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
GGML_LOG_ERROR("%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
return false;
}

View File

@@ -823,6 +823,7 @@ class GGUFEditorWindow(QMainWindow):
self.modified = False
self.metadata_changes = {} # Store changes to apply when saving
self.metadata_to_remove = set() # Store keys to remove when saving
self.on_metadata_changed_is_connected = False
self.setup_ui()
@@ -941,9 +942,11 @@ class GGUFEditorWindow(QMainWindow):
return
# Disconnect to prevent triggering during loading
if self.on_metadata_changed_is_connected:
with warnings.catch_warnings():
warnings.filterwarnings('ignore')
self.metadata_table.itemChanged.disconnect(self.on_metadata_changed)
self.on_metadata_changed_is_connected = False
for i, (key, field) in enumerate(self.reader.fields.items()):
self.metadata_table.insertRow(i)
@@ -1021,6 +1024,7 @@ class GGUFEditorWindow(QMainWindow):
# Reconnect after loading
self.metadata_table.itemChanged.connect(self.on_metadata_changed)
self.on_metadata_changed_is_connected = True
def extract_array_values(self, field: ReaderField) -> list:
"""Extract all values from an array field."""

View File

@@ -68,7 +68,7 @@ class TensorNameMap:
"output_layer", # chatglm
"head", # rwkv
"head.out", # wavtokenizer
"language_model.lm_head", # llama4
"lm_head", # llama4
),
# Output norm
@@ -91,7 +91,7 @@ class TensorNameMap:
"rwkv.ln_out", # rwkv6
"model.ln_out", # rwkv7
"backbone.final_layer_norm", # wavtokenizer
"language_model.model.norm", # llama4
"model.norm", # llama4
),
# Rope frequencies
@@ -133,7 +133,7 @@ class TensorNameMap:
"transformer.layers.{bid}.attn_norm", # openelm
"rwkv.blocks.{bid}.ln1", # rwkv6
"model.layers.{bid}.ln1", # rwkv7
"language_model.model.layers.{bid}.input_layernorm", # llama4
"model.layers.{bid}.input_layernorm", # llama4
),
# Attention norm 2
@@ -173,7 +173,7 @@ class TensorNameMap:
"model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
"transformer.h.{bid}.attn.attention.q_proj", # exaone
"language_model.model.layers.{bid}.self_attn.q_proj", # llama4
"model.layers.{bid}.self_attn.q_proj", # llama4
),
# Attention key
@@ -188,7 +188,7 @@ class TensorNameMap:
"model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
"transformer.h.{bid}.attn.attention.k_proj", # exaone
"language_model.model.layers.{bid}.self_attn.k_proj", # llama4
"model.layers.{bid}.self_attn.k_proj", # llama4
),
# Attention value
@@ -202,7 +202,7 @@ class TensorNameMap:
"model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
"transformer.h.{bid}.attn.attention.v_proj", # exaone
"language_model.model.layers.{bid}.self_attn.v_proj", # llama4
"model.layers.{bid}.self_attn.v_proj", # llama4
),
# Attention output
@@ -229,7 +229,7 @@ class TensorNameMap:
"encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.layers.{bid}.attn.out_proj", # openelm
"transformer.h.{bid}.attn.attention.out_proj", # exaone
"language_model.model.layers.{bid}.self_attn.o_proj", # llama4
"model.layers.{bid}.self_attn.o_proj", # llama4
),
# Attention output norm
@@ -268,7 +268,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm
"language_model.model.layers.{bid}.post_attention_layernorm", # llama4
"model.layers.{bid}.post_attention_layernorm", # llama4
),
# Post feed-forward norm
@@ -289,7 +289,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
"model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
"language_model.model.layers.{bid}.feed_forward.router", # llama4
"model.layers.{bid}.feed_forward.router", # llama4
"encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
),
@@ -329,7 +329,7 @@ class TensorNameMap:
"model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
"transformer.h.{bid}.mlp.c_fc_1", # exaone
"language_model.model.layers.{bid}.feed_forward.up_proj", # llama4
"model.layers.{bid}.feed_forward.up_proj", # llama4
),
MODEL_TENSOR.FFN_UP_EXP: (
@@ -338,14 +338,14 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
"language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"model.layers.{bid}.feed_forward.experts.up_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
),
MODEL_TENSOR.FFN_UP_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
"language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
),
# AWQ-activation gate
@@ -366,22 +366,22 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.linear_1", # refact
"model.layers.{bid}.residual_mlp.w1", # arctic
"transformer.h.{bid}.mlp.c_fc_0", # exaone
"language_model.model.layers.{bid}.feed_forward.gate_proj", # llama4
"model.layers.{bid}.feed_forward.gate_proj", # llama4
),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
"language_model.model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
"model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
),
MODEL_TENSOR.FFN_GATE_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
"language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
),
# Feed-forward down
@@ -410,7 +410,7 @@ class TensorNameMap:
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
"model.layers.h.{bid}.mlp.c_proj", # exaone
"language_model.model.layers.{bid}.feed_forward.down_proj", # llama4
"model.layers.{bid}.feed_forward.down_proj", # llama4
),
MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -420,15 +420,15 @@ class TensorNameMap:
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
"model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
"model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
"language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
"model.layers.{bid}.feed_forward.experts.down_proj", # llama4
"encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
),
MODEL_TENSOR.FFN_DOWN_SHEXP: (
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
"language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
),
MODEL_TENSOR.ATTN_Q_NORM: (

View File

@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(
meta.reset(gguf_init_from_file(fname.c_str(), params));
if (!meta) {
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
}
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
};
gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
if (!ctx_gguf) {
throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
}
// check idx

View File

@@ -140,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
struct llama_model_params params) {
ggml_time_init();
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
return nullptr;
}
unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
params.progress_callback_user_data = &cur_percentage;

View File

@@ -687,7 +687,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto value = argv[i];
auto * value = argv[i];
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
if (buft_list.empty()) {
// enumerate all the devices and add their buffer types to the list
@@ -719,7 +719,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
// memory leak present in the implementation
// over in arg.cpp. Acceptable because we
// only parse these args once in this program.
auto override_group = value;
auto * override_group = value;
if (value[override_group_span_len] == '\0') {
value = &value[override_group_span_len];
last_group = true;
@@ -730,7 +730,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
auto override_span_len = std::strcspn(override_group, ";");
while (override_span_len > 0) {
auto override = override_group;
auto * override = override_group;
if (override_group[override_span_len] != '\0') {
override_group[override_span_len] = '\0';
override_group = &override_group[override_span_len + 1];
@@ -743,9 +743,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
override[tensor_name_span_len] = '\0';
auto tensor_name = override;
auto buffer_type = &override[tensor_name_span_len + 1];
auto * tensor_name = override;
auto * buffer_type = &override[tensor_name_span_len + 1];
if (buft_list.find(buffer_type) == buft_list.end()) {
printf("error: unrecognized buffer type '%s'\n", buffer_type);
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
@@ -1826,10 +1827,11 @@ int main(int argc, char ** argv) {
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif
cmd_params params = parse_cmd_params(argc, argv);
// initialize backends
ggml_backend_load_all();
cmd_params params = parse_cmd_params(argc, argv);
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);

Binary file not shown.

View File

@@ -2251,6 +2251,14 @@ struct server_context {
slot.has_next_token = true;
}
// if context shifting is disabled, make sure that we don't run out of context
if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
slot.stop = STOP_TYPE_LIMIT;
slot.has_next_token = false;
SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
}
// check the limits
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
slot.stop = STOP_TYPE_LIMIT;

View File

@@ -65,3 +65,21 @@ def test_ctx_shift_disabled_long_prompt():
assert res.status_code != 200
assert "error" in res.body
assert "exceeds the available context size" in res.body["error"]["message"]
def test_ctx_shift_disabled_stream():
global server
server.disable_ctx_shift = True
server.start()
res = server.make_stream_request("POST", "/v1/completions", data={
"n_predict": 256,
"prompt": "Once",
"stream": True,
})
content = ""
for data in res:
choice = data["choices"][0]
if choice["finish_reason"] == "length":
assert len(content) > 0
else:
assert choice["finish_reason"] is None
content += choice["text"]

View File

@@ -18,6 +18,7 @@
"dexie": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"pdfjs-dist": "^5.2.133",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",
@@ -988,7 +989,7 @@
"version": "0.3.8",
"resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.8.tgz",
"integrity": "sha512-imAbBGkb+ebQyxKgzv5Hu2nmROxoDOXHh80evxdoXNOrvAnVx7zimzc1Oo5h9RlfV4vPXaE2iM5pOFbvOCClWA==",
"dev": true,
"devOptional": true,
"license": "MIT",
"dependencies": {
"@jridgewell/set-array": "^1.2.1",
@@ -1003,7 +1004,7 @@
"version": "3.1.2",
"resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
"integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
"dev": true,
"devOptional": true,
"license": "MIT",
"engines": {
"node": ">=6.0.0"
@@ -1013,30 +1014,224 @@
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz",
"integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==",
"dev": true,
"devOptional": true,
"license": "MIT",
"engines": {
"node": ">=6.0.0"
}
},
"node_modules/@jridgewell/source-map": {
"version": "0.3.6",
"resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz",
"integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"@jridgewell/gen-mapping": "^0.3.5",
"@jridgewell/trace-mapping": "^0.3.25"
}
},
"node_modules/@jridgewell/sourcemap-codec": {
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
"integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
"dev": true,
"devOptional": true,
"license": "MIT"
},
"node_modules/@jridgewell/trace-mapping": {
"version": "0.3.25",
"resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz",
"integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==",
"dev": true,
"devOptional": true,
"license": "MIT",
"dependencies": {
"@jridgewell/resolve-uri": "^3.1.0",
"@jridgewell/sourcemap-codec": "^1.4.14"
}
},
"node_modules/@napi-rs/canvas": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.70.tgz",
"integrity": "sha512-nD6NGa4JbNYSZYsTnLGrqe9Kn/lCkA4ybXt8sx5ojDqZjr2i0TWAHxx/vhgfjX+i3hCdKWufxYwi7CfXqtITSA==",
"license": "MIT",
"optional": true,
"engines": {
"node": ">= 10"
},
"optionalDependencies": {
"@napi-rs/canvas-android-arm64": "0.1.70",
"@napi-rs/canvas-darwin-arm64": "0.1.70",
"@napi-rs/canvas-darwin-x64": "0.1.70",
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.70",
"@napi-rs/canvas-linux-arm64-gnu": "0.1.70",
"@napi-rs/canvas-linux-arm64-musl": "0.1.70",
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.70",
"@napi-rs/canvas-linux-x64-gnu": "0.1.70",
"@napi-rs/canvas-linux-x64-musl": "0.1.70",
"@napi-rs/canvas-win32-x64-msvc": "0.1.70"
}
},
"node_modules/@napi-rs/canvas-android-arm64": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.70.tgz",
"integrity": "sha512-I/YOuQ0wbkVYxVaYtCgN42WKTYxNqFA0gTcTrHIGG1jfpDSyZWII/uHcjOo4nzd19io6Y4+/BqP8E5hJgf9OmQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-arm64": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.70.tgz",
"integrity": "sha512-4pPGyXetHIHkw2TOJHujt3mkCP8LdDu8+CT15ld9Id39c752RcI0amDHSuMLMQfAjvusA9B5kKxazwjMGjEJpQ==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-darwin-x64": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.70.tgz",
"integrity": "sha512-+2N6Os9LbkmDMHL+raknrUcLQhsXzc5CSXRbXws9C3pv/mjHRVszQ9dhFUUe9FjfPhCJznO6USVdwOtu7pOrzQ==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.70.tgz",
"integrity": "sha512-QjscX9OaKq/990sVhSMj581xuqLgiaPVMjjYvWaCmAJRkNQ004QfoSMEm3FoTqM4DRoquP8jvuEXScVJsc1rqQ==",
"cpu": [
"arm"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.70.tgz",
"integrity": "sha512-LNakMOwwqwiHIwMpnMAbFRczQMQ7TkkMyATqFCOtUJNlE6LPP/QiUj/mlFrNbUn/hctqShJ60gWEb52ZTALbVw==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.70.tgz",
"integrity": "sha512-wBTOllEYNfJCHOdZj9v8gLzZ4oY3oyPX8MSRvaxPm/s7RfEXxCyZ8OhJ5xAyicsDdbE5YBZqdmaaeP5+xKxvtg==",
"cpu": [
"arm64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.70.tgz",
"integrity": "sha512-GVUUPC8TuuFqHip0rxHkUqArQnlzmlXmTEBuXAWdgCv85zTCFH8nOHk/YCF5yo0Z2eOm8nOi90aWs0leJ4OE5Q==",
"cpu": [
"riscv64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.70.tgz",
"integrity": "sha512-/kvUa2lZRwGNyfznSn5t1ShWJnr/m5acSlhTV3eXECafObjl0VBuA1HJw0QrilLpb4Fe0VLywkpD1NsMoVDROQ==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-linux-x64-musl": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.70.tgz",
"integrity": "sha512-aqlv8MLpycoMKRmds7JWCfVwNf1fiZxaU7JwJs9/ExjTD8lX2KjsO7CTeAj5Cl4aEuzxUWbJPUUE2Qu9cZ1vfg==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
"version": "0.1.70",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.70.tgz",
"integrity": "sha512-Q9QU3WIpwBTVHk4cPfBjGHGU4U0llQYRXgJtFtYqqGNEOKVN4OT6PQ+ve63xwIPODMpZ0HHyj/KLGc9CWc3EtQ==",
"cpu": [
"x64"
],
"license": "MIT",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">= 10"
}
},
"node_modules/@nodelib/fs.scandir": {
"version": "2.1.5",
"resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -2002,7 +2197,7 @@
"version": "8.14.0",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.0.tgz",
"integrity": "sha512-cl669nCJTZBsL97OF4kUQm5g5hC2uihk0NxY3WENAC0TYdILVkAyHymAntgxGkl7K+t0cXIrH5siy5S4XkFycA==",
"dev": true,
"devOptional": true,
"license": "MIT",
"bin": {
"acorn": "bin/acorn"
@@ -2186,6 +2381,14 @@
"devOptional": true,
"license": "MIT/X11"
},
"node_modules/buffer-from": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz",
"integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
"license": "MIT",
"optional": true,
"peer": true
},
"node_modules/callsites": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz",
@@ -4843,6 +5046,18 @@
"node": ">=8"
}
},
"node_modules/pdfjs-dist": {
"version": "5.2.133",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.2.133.tgz",
"integrity": "sha512-abE6ZWDxztt+gGFzfm4bX2ggfxUk9wsDEoFzIJm9LozaY3JdXR7jyLK4Bjs+XLXplCduuWS1wGhPC4tgTn/kzg==",
"license": "Apache-2.0",
"engines": {
"node": ">=20.16.0 || >=22.3.0"
},
"optionalDependencies": {
"@napi-rs/canvas": "^0.1.67"
}
},
"node_modules/picocolors": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
@@ -5753,6 +5968,17 @@
"node": ">=8"
}
},
"node_modules/source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
"integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
"license": "BSD-3-Clause",
"optional": true,
"peer": true,
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/source-map-js": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
@@ -5762,6 +5988,18 @@
"node": ">=0.10.0"
}
},
"node_modules/source-map-support": {
"version": "0.5.21",
"resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz",
"integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==",
"license": "MIT",
"optional": true,
"peer": true,
"dependencies": {
"buffer-from": "^1.0.0",
"source-map": "^0.6.0"
}
},
"node_modules/space-separated-tokens": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz",
@@ -5859,6 +6097,34 @@
"node": ">=6"
}
},
"node_modules/terser": {
"version": "5.39.1",
"resolved": "https://registry.npmjs.org/terser/-/terser-5.39.1.tgz",
"integrity": "sha512-Mm6+uad0ZuDtcV8/4uOZQDQ8RuiC5Pu+iZRedJtF7yA/27sPL7d++In/AJKpWZlU3SYMPPkVfwetn6sgZ66pUA==",
"license": "BSD-2-Clause",
"optional": true,
"peer": true,
"dependencies": {
"@jridgewell/source-map": "^0.3.3",
"acorn": "^8.8.2",
"commander": "^2.20.0",
"source-map-support": "~0.5.20"
},
"bin": {
"terser": "bin/terser"
},
"engines": {
"node": ">=10"
}
},
"node_modules/terser/node_modules/commander": {
"version": "2.20.3",
"resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz",
"integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==",
"license": "MIT",
"optional": true,
"peer": true
},
"node_modules/textlinestream": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",

View File

@@ -21,6 +21,7 @@
"dexie": "^4.0.11",
"highlight.js": "^11.10.0",
"katex": "^0.16.15",
"pdfjs-dist": "^5.2.133",
"postcss": "^8.4.49",
"react": "^18.3.1",
"react-dom": "^18.3.1",

View File

@@ -28,13 +28,13 @@ function AppLayout() {
return (
<>
<Sidebar />
<div
<main
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto bg-base-100"
id="main-scroll"
>
<Header />
<Outlet />
</div>
</main>
{
<SettingDialog
show={showSettings}

View File

@@ -16,6 +16,8 @@ export const CONFIG_DEFAULT = {
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
pasteLongTextToFileLen: 2500,
pdfAsImage: false,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
@@ -43,6 +45,8 @@ export const CONFIG_DEFAULT = {
export const CONFIG_INFO: Record<string, string> = {
apiKey: 'Set the API Key if you are using the --api-key option for the server.',
systemMessage: 'The starting message that defines how the model should behave.',
pasteLongTextToFileLen:
'When pasting long text, it is converted to a file. This parameter sets the minimum length at which that happens; a value of 0 disables it.',
samplers:
'The order in which samplers are applied, in a simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature'
temperature:
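
A minimal sketch of the threshold check the pasteLongTextToFileLen description implies (illustrative; the actual check lives in the chat input's paste handler shown further below):

// Convert a paste into a file only when the feature is enabled (> 0)
// and the pasted text exceeds the configured length.
function shouldConvertPasteToFile(
  text: string,
  pasteLongTextToFileLen: number
): boolean {
  return pasteLongTextToFileLen > 0 && text.length > pasteLongTextToFileLen;
}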

View File

@@ -18,16 +18,26 @@ export default function ChatInputExtraContextItem({
if (!items) return null;
return (
<div className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1">
<div
className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1"
role="group"
aria-description="Selected files"
>
{items.map((item, i) => (
<div
className="indicator"
key={i}
onClick={() => clickToShow && setShow(i)}
tabIndex={0}
aria-description={
clickToShow ? `Click to show: ${item.name}` : undefined
}
role={clickToShow ? 'button' : 'menuitem'}
>
{removeItem && (
<div className="indicator-item indicator-top">
<button
aria-label="Remove file"
className="btn btn-neutral btn-sm w-4 h-4 p-0 rounded-full"
onClick={() => removeItem(i)}
>
@@ -46,13 +56,16 @@ export default function ChatInputExtraContextItem({
<>
<img
src={item.base64Url}
alt={item.name}
alt={`Preview image for ${item.name}`}
className="w-14 h-14 object-cover rounded-md"
/>
</>
) : (
<>
<div className="w-14 h-14 flex items-center justify-center">
<div
className="w-14 h-14 flex items-center justify-center"
aria-description="Document icon"
>
<DocumentTextIcon className="h-8 w-14 text-base-content/50" />
</div>
@@ -66,16 +79,25 @@ export default function ChatInputExtraContextItem({
))}
{showingItem && (
<dialog className="modal modal-open">
<dialog
className="modal modal-open"
aria-description={`Preview ${showingItem.name}`}
>
<div className="modal-box">
<div className="flex justify-between items-center mb-4">
<b>{showingItem.name ?? 'Extra content'}</b>
<button className="btn btn-ghost btn-sm">
<button
className="btn btn-ghost btn-sm"
aria-label="Close preview dialog"
>
<XMarkIcon className="h-5 w-5" onClick={() => setShow(-1)} />
</button>
</div>
{showingItem.type === 'imageFile' ? (
<img src={showingItem.base64Url} alt={showingItem.name} />
<img
src={showingItem.base64Url}
alt={`Preview image for ${showingItem.name}`}
/>
) : (
<div className="overflow-x-auto">
<pre className="whitespace-pre-wrap break-words text-sm">

View File

@@ -83,13 +83,20 @@ export default function ChatMessage({
if (!viewingChat) return null;
const isUser = msg.role === 'user';
return (
<div className="group" id={id}>
<div
className="group"
id={id}
role="group"
aria-description={`Message from ${msg.role}`}
>
<div
className={classNames({
chat: true,
'chat-start': msg.role !== 'user',
'chat-end': msg.role === 'user',
'chat-start': !isUser,
'chat-end': isUser,
})}
>
{msg.extra && msg.extra.length > 0 && (
@@ -99,7 +106,7 @@ export default function ChatMessage({
<div
className={classNames({
'chat-bubble markdown': true,
'chat-bubble bg-transparent': msg.role !== 'user',
'chat-bubble bg-transparent': !isUser,
})}
>
{/* textarea for editing message */}
@@ -142,7 +149,7 @@ export default function ChatMessage({
) : (
<>
{/* render message as markdown */}
<div dir="auto">
<div dir="auto" tabIndex={0}>
{thought && (
<ThoughtProcess
isThinking={!!isThinking && !!isPending}
@@ -196,13 +203,18 @@ export default function ChatMessage({
})}
>
{siblingLeafNodeIds && siblingLeafNodeIds.length > 1 && (
<div className="flex gap-1 items-center opacity-60 text-sm">
<div
className="flex gap-1 items-center opacity-60 text-sm"
role="navigation"
aria-description={`Message version ${siblingCurrIdx + 1} of ${siblingLeafNodeIds.length}`}
>
<button
className={classNames({
'btn btn-sm btn-ghost p-1': true,
'opacity-20': !prevSibling,
})}
onClick={() => prevSibling && onChangeSibling(prevSibling)}
aria-label="Previous message version"
>
<ChevronLeftIcon className="h-4 w-4" />
</button>
@@ -215,6 +227,7 @@ export default function ChatMessage({
'opacity-20': !nextSibling,
})}
onClick={() => nextSibling && onChangeSibling(nextSibling)}
aria-label="Next message version"
>
<ChevronRightIcon className="h-4 w-4" />
</button>
@@ -223,7 +236,7 @@ export default function ChatMessage({
{/* user message */}
{msg.role === 'user' && (
<BtnWithTooltips
className="btn-mini show-on-hover w-8 h-8"
className="btn-mini w-8 h-8"
onClick={() => setEditingContent(msg.content)}
disabled={msg.content === null}
tooltipsContent="Edit message"
@@ -236,7 +249,7 @@ export default function ChatMessage({
<>
{!isPending && (
<BtnWithTooltips
className="btn-mini show-on-hover w-8 h-8"
className="btn-mini w-8 h-8"
onClick={() => {
if (msg.content !== null) {
onRegenerateMessage(msg as Message);
@@ -250,10 +263,7 @@ export default function ChatMessage({
)}
</>
)}
<CopyButton
className="btn-mini show-on-hover w-8 h-8"
content={msg.content}
/>
<CopyButton className="btn-mini w-8 h-8" content={msg.content} />
</div>
)}
</div>
@@ -271,6 +281,8 @@ function ThoughtProcess({
}) {
return (
<div
role="button"
aria-label="Toggle thought process display"
tabIndex={0}
className={classNames({
'collapse bg-none': true,
@@ -292,7 +304,11 @@ function ThoughtProcess({
)}
</div>
</div>
<div className="collapse-content text-base-content/70 text-sm p-1">
<div
className="collapse-content text-base-content/70 text-sm p-1"
tabIndex={0}
aria-description="Thought process content"
>
<div className="border-l-2 border-base-content/20 pl-4 mb-4">
<MarkdownDisplay content={content} />
</div>
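
The version switcher above relies on sibling bookkeeping that the hunk only hints at. A hedged sketch of how prevSibling, nextSibling, and the announced position could be derived (names assumed from the props; the real webui logic may differ):

// Given the ordered sibling leaf ids and the currently shown one, derive
// the neighbors and the "version X of Y" label used in aria-description.
function siblingNav(siblingLeafNodeIds: string[], currentId: string) {
  const siblingCurrIdx = siblingLeafNodeIds.indexOf(currentId);
  return {
    prevSibling: siblingCurrIdx > 0 ? siblingLeafNodeIds[siblingCurrIdx - 1] : null,
    nextSibling:
      siblingCurrIdx >= 0 && siblingCurrIdx < siblingLeafNodeIds.length - 1
        ? siblingLeafNodeIds[siblingCurrIdx + 1]
        : null,
    label: `Message version ${siblingCurrIdx + 1} of ${siblingLeafNodeIds.length}`,
  };
}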

View File

@@ -279,7 +279,11 @@ export default function ChatScreen() {
function ServerInfo() {
const { serverProps } = useAppContext();
return (
<div className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6">
<div
className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
tabIndex={0}
aria-description="Server information"
>
<div className="card-body">
<b>Server Info</b>
<p>
@@ -306,10 +310,13 @@ function ChatInput({
onStop: () => void;
isGenerating: boolean;
}) {
const { config } = useAppContext();
const [isDrag, setIsDrag] = useState(false);
return (
<div
role="group"
aria-label="Chat input"
className={classNames({
'flex items-end pt-8 pb-6 sticky bottom-0 bg-base-100': true,
'opacity-50': isDrag, // simple visual feedback to inform the user that the file will be accepted
@@ -328,7 +335,28 @@ function ChatInput({
{({ getRootProps, getInputProps }) => (
<div
className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
// when a file is pasted into the input, we handle it here
// if text is pasted and it exceeds the configured length, we convert it to a file
onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
const text = e.clipboardData.getData('text/plain');
if (
text.length > 0 &&
config.pasteLongTextToFileLen > 0 &&
text.length > config.pasteLongTextToFileLen
) {
// if the text is too long, we will convert it to a file
extraContext.addItems([
{
type: 'context',
name: 'Pasted Content',
content: text,
},
]);
e.preventDefault();
return;
}
// if a file is pasted, we will handle it here
const files = Array.from(e.clipboardData.items)
.filter((item) => item.kind === 'file')
.map((item) => item.getAsFile())
@@ -378,13 +406,15 @@ function ChatInput({
'btn w-8 h-8 p-0 rounded-full': true,
'btn-disabled': isGenerating,
})}
aria-label="Upload file"
tabIndex={0}
role="button"
>
<PaperClipIcon className="h-5 w-5" />
</label>
<input
id="file-upload"
type="file"
className="hidden"
disabled={isGenerating}
{...getInputProps()}
hidden
@@ -400,6 +430,7 @@ function ChatInput({
<button
className="btn btn-primary w-8 h-8 p-0 rounded-full"
onClick={onSend}
aria-label="Send message"
>
<ArrowUpIcon className="h-5 w-5" />
</button>
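
Distilled into a self-contained handler, the onPasteCapture logic above makes one decision: long text becomes a context item, anything else is forwarded as files (a sketch; addTextItem and addFiles stand in for the real extraContext handlers):

// Pasted text above the threshold is intercepted and attached as a file;
// otherwise any pasted files are extracted from the clipboard items.
function handlePaste(
  e: ClipboardEvent,
  thresholdLen: number,
  addTextItem: (name: string, content: string) => void,
  addFiles: (files: File[]) => void
): void {
  const text = e.clipboardData?.getData('text/plain') ?? '';
  if (thresholdLen > 0 && text.length > thresholdLen) {
    addTextItem('Pasted Content', text);
    e.preventDefault(); // keep the long text out of the textarea
    return;
  }
  const files = Array.from(e.clipboardData?.items ?? [])
    .filter((item) => item.kind === 'file')
    .map((item) => item.getAsFile())
    .filter((f): f is File => f !== null); // getAsFile() can return null
  if (files.length > 0) addFiles(files);
}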

View File

@@ -38,8 +38,12 @@ export default function Header() {
{/* action buttons (top right) */}
<div className="flex items-center">
<div className="tooltip tooltip-bottom" data-tip="Settings">
<button className="btn" onClick={() => setShowSettings(true)}>
<div
className="tooltip tooltip-bottom"
data-tip="Settings"
onClick={() => setShowSettings(true)}
>
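{/* click handling lives on the tooltip wrapper; the inner button is aria-hidden so screen readers announce the control only once */}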
<button className="btn" aria-hidden={true}>
{/* settings button */}
<Cog8ToothIcon className="w-5 h-5" />
</button>

View File

@@ -100,6 +100,16 @@ const SETTING_SECTIONS: SettingSection[] = [
key,
}) as SettingFieldInput
),
{
type: SettingInputType.SHORT_INPUT,
label: 'Paste length to file',
key: 'pasteLongTextToFileLen',
},
{
type: SettingInputType.CHECKBOX,
label: 'Parse PDF as image instead of text',
key: 'pdfAsImage',
},
],
},
{
@@ -325,14 +335,22 @@ export default function SettingDialog({
};
return (
<dialog className={classNames({ modal: true, 'modal-open': show })}>
<dialog
className={classNames({ modal: true, 'modal-open': show })}
aria-label="Settings dialog"
>
<div className="modal-box w-11/12 max-w-3xl">
<h3 className="text-lg font-bold mb-6">Settings</h3>
<div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
{/* Left panel, showing sections - Desktop version */}
<div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
<div
className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200"
role="complementary"
aria-description="Settings sections"
tabIndex={0}
>
{SETTING_SECTIONS.map((section, idx) => (
<div
<button
key={idx}
className={classNames({
'btn btn-ghost justify-start font-normal w-44 mb-1': true,
@@ -342,12 +360,16 @@ export default function SettingDialog({
dir="auto"
>
{section.title}
</div>
</button>
))}
</div>
{/* Left panel, showing sections - Mobile version */}
<div className="md:hidden flex flex-row gap-2 mb-4">
{/* This menu is hidden from a11y, since it would otherwise duplicate the desktop version */}
<div
className="md:hidden flex flex-row gap-2 mb-4"
aria-disabled={true}
>
<details className="dropdown">
<summary className="btn bt-sm w-full m-1">
{SETTING_SECTIONS[sectionIdx].title}
@@ -452,10 +474,10 @@ function SettingsModalLongInput({
label?: string;
}) {
return (
<label className="form-control mb-2">
<div className="label inline">{label || configKey}</div>
<label className="form-control">
<div className="label inline text-sm">{label || configKey}</div>
<textarea
className="textarea textarea-bordered h-24"
className="textarea textarea-bordered h-24 mb-2"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
@@ -482,9 +504,7 @@ function SettingsModalShortInput({
<>
{/* on mobile, we simply show the help message here */}
{helpMsg && (
<div className="block md:hidden mb-1">
<b>{label || configKey}</b>
<br />
<div className="block mb-1 opacity-75">
<p className="text-xs">{helpMsg}</p>
</div>
)}
@@ -493,11 +513,6 @@ function SettingsModalShortInput({
<div tabIndex={0} role="button" className="font-bold hidden md:block">
{label || configKey}
</div>
{helpMsg && (
<div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{helpMsg}
</div>
)}
</div>
<input
type="text"

View File

@@ -50,44 +50,72 @@ export default function Sidebar() {
id="toggle-drawer"
type="checkbox"
className="drawer-toggle"
aria-label="Toggle sidebar"
defaultChecked
/>
<div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
<div
className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64"
role="complementary"
aria-label="Sidebar"
tabIndex={0}
>
<label
htmlFor="toggle-drawer"
aria-label="close sidebar"
aria-label="Close sidebar"
className="drawer-overlay"
></label>
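{/* visually hidden skip-link: positioned off-screen but still focusable, letting keyboard and screen-reader users jump straight to #main-scroll */}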
<a
href="#main-scroll"
className="absolute -left-80 top-0 w-1 h-1 overflow-hidden"
>
Skip to main content
</a>
<div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
<div className="flex flex-row items-center justify-between mb-4 mt-4">
<h2 className="font-bold ml-4">Conversations</h2>
<h2 className="font-bold ml-4" role="heading">
Conversations
</h2>
{/* close sidebar button */}
<label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
<label
htmlFor="toggle-drawer"
className="btn btn-ghost lg:hidden"
aria-label="Close sidebar"
role="button"
tabIndex={0}
>
<XMarkIcon className="w-5 h-5" />
</label>
</div>
{/* new conversation button */}
<div
<button
className={classNames({
'btn btn-ghost justify-start px-2': true,
'btn-soft': !currConv,
})}
onClick={() => navigate('/')}
aria-label="New conversation"
>
<PencilSquareIcon className="w-5 h-5" />
New conversation
</div>
</button>
{/* list of conversations */}
{groupedConv.map((group, i) => (
<div key={i}>
<div key={i} role="group">
{/* group name (by date) */}
{group.title ? (
// we use btn class here to make sure that the padding/margin are aligned with the other items
<b className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold">
<b
className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold"
role="note"
aria-description={group.title}
tabIndex={0}
>
{group.title}
</b>
) : (
@@ -184,20 +212,23 @@ function ConversationItem({
}) {
return (
<div
role="menuitem"
tabIndex={0}
aria-label={conv.name}
className={classNames({
'group flex flex-row btn btn-ghost justify-start items-center font-normal px-2 h-9':
true,
'btn-soft': isCurrConv,
})}
>
<div
<button
key={conv.id}
className="w-full overflow-hidden truncate text-start"
onClick={onSelect}
dir="auto"
>
{conv.name}
</div>
</button>
<div className="dropdown dropdown-end h-5">
<BtnWithTooltips
// on mobile, we always show the ellipsis icon
@@ -211,22 +242,23 @@ function ConversationItem({
</BtnWithTooltips>
{/* dropdown menu */}
<ul
aria-label="More options"
tabIndex={0}
className="dropdown-content menu bg-base-100 rounded-box z-[1] p-2 shadow"
>
<li onClick={onRename}>
<li onClick={onRename} tabIndex={0}>
<a>
<PencilIcon className="w-4 h-4" />
Rename
</a>
</li>
<li onClick={onDownload}>
<li onClick={onDownload} tabIndex={0}>
<a>
<ArrowDownTrayIcon className="w-4 h-4" />
Download
</a>
</li>
<li className="text-error" onClick={onDelete}>
<li className="text-error" onClick={onDelete} tabIndex={0}>
<a>
<TrashIcon className="w-4 h-4" />
Delete

View File

@@ -2,6 +2,17 @@ import { useState } from 'react';
import { MessageExtra } from '../utils/types';
import toast from 'react-hot-toast';
import { useAppContext } from '../utils/app.context';
import * as pdfjs from 'pdfjs-dist';
import pdfjsWorkerSrc from 'pdfjs-dist/build/pdf.worker.min.mjs?url';
import { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
// This file handles uploading extra context items (a.k.a. files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - text files (including code files)
// - pdf (converted to text, or to images when "Parse PDF as image" is enabled)
// Interface describing the API returned by the hook
export interface ChatExtraContextApi {
@@ -13,7 +24,7 @@ export interface ChatExtraContextApi {
}
export function useChatExtraContext(): ChatExtraContextApi {
const { serverProps } = useAppContext();
const { serverProps, config } = useAppContext();
const [items, setItems] = useState<MessageExtra[]>([]);
const addItems = (newItems: MessageExtra[]) => {
@@ -28,6 +39,8 @@ export function useChatExtraContext(): ChatExtraContextApi {
setItems([]);
};
const isSupportVision = serverProps?.modalities?.vision;
const onFileAdded = (files: File[]) => {
for (const file of files) {
const mimeType = file.type;
@@ -38,7 +51,7 @@ export function useChatExtraContext(): ChatExtraContextApi {
}
if (mimeType.startsWith('image/')) {
if (!serverProps?.modalities?.vision) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
@@ -69,7 +82,43 @@ export function useChatExtraContext(): ChatExtraContextApi {
toast.error('Video and audio files are not supported yet.');
break;
} else if (mimeType.startsWith('application/pdf')) {
toast.error('PDF files are not supported yet.');
if (config.pdfAsImage && !isSupportVision) {
toast(
'Multimodal is not supported; PDF will be converted to text instead of image.'
);
break;
}
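// decision below: pdfAsImage with vision support renders one image item
// per page; otherwise the whole PDF is extracted into a single text item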
const promise =
config.pdfAsImage && isSupportVision
? convertPDFToImage(file).then((base64Urls) => {
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
}))
);
})
: convertPDFToText(file).then((content) => {
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to an image; see Settings.'
);
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
});
promise.catch((error) => {
console.error(error);
toast.error('Failed to parse PDF file.');
});
break;
} else {
// Because there can be many text file types (like code files), we do not check the MIME type
@@ -105,11 +154,69 @@ export function useChatExtraContext(): ChatExtraContextApi {
};
}
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
resolve(event.target.result as ArrayBuffer);
} else {
reject(new Error('Failed to read file.'));
}
};
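// NOTE: no `onerror` handler is attached, so this promise never rejects on
// a failed read; the standard file.arrayBuffer() (Blob.arrayBuffer()) is a
// simpler alternative that does reject on failure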
reader.readAsArrayBuffer(file);
});
}
async function convertPDFToText(file: File): Promise<string> {
const buffer = await getFileAsBuffer(file);
const pdf = await pdfjs.getDocument(buffer).promise;
const numPages = pdf.numPages;
const textContentPromises: Promise<TextContent>[] = [];
for (let i = 1; i <= numPages; i++) {
textContentPromises.push(
pdf.getPage(i).then((page) => page.getTextContent())
);
}
const textContents = await Promise.all(textContentPromises);
const textItems = textContents.flatMap((textContent: TextContent) =>
textContent.items.map((item) => (item as TextItem).str ?? '')
);
return textItems.join('\n');
}
// returns list of base64 images
async function convertPDFToImage(file: File): Promise<string[]> {
const buffer = await getFileAsBuffer(file);
const doc = await pdfjs.getDocument(buffer).promise;
const pages: Promise<string>[] = [];
for (let i = 1; i <= doc.numPages; i++) {
const page = await doc.getPage(i);
const viewport = page.getViewport({ scale: 1.5 });
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
canvas.width = viewport.width;
canvas.height = viewport.height;
if (!ctx) {
throw new Error('Failed to get 2D context from canvas');
}
const task = page.render({ canvasContext: ctx, viewport: viewport });
pages.push(
task.promise.then(() => {
return canvas.toDataURL();
})
);
}
return await Promise.all(pages);
}
// WARN: vibe code below
// This code is a heuristic to determine if a string is likely not binary.
// It is necessary because input files can have various MIME types which we don't have time to investigate.
// For example, a Python file can be text/plain, application/x-python, etc.
export function isLikelyNotBinary(str: string): boolean {
function isLikelyNotBinary(str: string): boolean {
const options = {
prefixLength: 1024 * 10, // Check the first 10KB of the string
suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
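
The hunk cuts off right after the heuristic's tuning options. A hedged sketch of how such a check can be completed (illustrative; the repo's actual implementation may count different character classes):

// Scan a 10 KB prefix and reject the string as probable binary when the
// share of control/replacement characters exceeds the threshold.
function isProbablyNotBinary(str: string): boolean {
  const prefixLength = 1024 * 10;
  const suspiciousCharThresholdRatio = 0.15;
  const sample = str.slice(0, prefixLength);
  let total = 0;
  let suspicious = 0;
  for (const ch of sample) {
    total++;
    const code = ch.codePointAt(0)!;
    // control chars (other than tab/LF/CR) and U+FFFD replacement chars
    // are strong hints of binary or mis-decoded input
    if ((code < 32 && code !== 9 && code !== 10 && code !== 13) || code === 0xfffd) {
      suspicious++;
    }
  }
  return total === 0 || suspicious / total <= suspiciousCharThresholdRatio;
}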

View File

@@ -34,9 +34,6 @@ html {
/* TODO: fix markdown table */
}
.show-on-hover {
@apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
@apply cursor-pointer;
}

View File

@@ -52,13 +52,20 @@ export function BtnWithTooltips({
tooltipsContent: string;
disabled?: boolean;
}) {
// the onClick handler is on the container, so screen readers can safely ignore the inner button
// this prevents the label from being read twice
return (
<div className="tooltip tooltip-bottom" data-tip={tooltipsContent}>
<div
className="tooltip tooltip-bottom"
data-tip={tooltipsContent}
role="button"
onClick={onClick}
>
<button
className={`${className ?? ''} flex items-center justify-center`}
onClick={onClick}
disabled={disabled}
onMouseLeave={onMouseLeave}
aria-hidden={true}
>
{children}
</button>
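
A usage sketch of the component above (hypothetical; the import path and icon are placeholders): since the wrapper owns the click handler and the inner button is aria-hidden, assistive technology announces the control only once.

import { BtnWithTooltips } from './common'; // path assumed

function RegenerateControl({ onRegenerate }: { onRegenerate: () => void }) {
  return (
    <BtnWithTooltips
      className="btn-mini w-8 h-8"
      onClick={onRegenerate}
      tooltipsContent="Regenerate response"
    >
      <span>↻</span>
    </BtnWithTooltips>
  );
}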

View File

@@ -7,7 +7,7 @@ import * as fflate from 'fflate';
/* eslint-disable */
const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
const MAX_BUNDLE_SIZE = 2 * 1024 * 1024; // only increase when absolutely necessary
const GUIDE_FOR_FRONTEND = `
<!--