Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-12 14:03:20 +02:00)

Compare commits (19 commits)
Commits:
4f41ee11d6, 3e0be1cace, 6aa892ec2a, aea9f8b4e7, 06c1e4abc1, 415e40a357, 654a67794f, 5364ae4ba5, 7c07ac244d, 0a338ed013, bc098c3cf0, c6a2c9e741, 07ad2b6db3, c531edfa34, 02cdd2d8b0, 64bb51cf90, 9c404ed54c, 6c8b91500e, 3cc1f1f1d2
.github/actions/windows-setup-curl/action.yml

@@ -5,6 +5,10 @@ inputs:
     description: 'CURL version'
     required: false
     default: '8.6.0_6'
+  architecture:
+    description: 'Architecture of the libcurl to download'
+    required: false
+    default: 'win64'
 outputs:
   curl_path:
     description: "Path to the downloaded libcurl"
@@ -18,8 +22,9 @@ runs:
     shell: powershell
     env:
       CURL_VERSION: ${{ inputs.curl_version }}
+      ARCHITECTURE: ${{ inputs.architecture }}
     run: |
-      curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+      curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip"
       mkdir $env:RUNNER_TEMP/libcurl
       tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
       echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT
.github/workflows/build-linux-cross.yml (vendored, 91 lines changed)
@@ -140,3 +140,94 @@ jobs:
             -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

           cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-cpu-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu \
+                  libcurl4-openssl-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-ppc64el-vulkan-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu \
+                  libvulkan-dev:ppc64el \
+                  libcurl4-openssl-dev:ppc64el
+
+      - name: Build
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
.github/workflows/release.yml (vendored, 9 lines changed)
@@ -238,14 +238,19 @@ jobs:
       matrix:
         include:
           - build: 'cpu-x64'
+            arch: 'x64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
           #- build: 'openblas-x64'
+          #  arch: 'x64'
           #  defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'vulkan-x64'
+            arch: 'x64'
             defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
           - build: 'cpu-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF'
           - build: 'opencl-adreno-arm64'
+            arch: 'arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

     steps:
@@ -312,6 +317,8 @@ jobs:
       - name: libCURL
         id: get_libcurl
         uses: ./.github/actions/windows-setup-curl
+        with:
+          architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }}

       - name: Build
         id: cmake_build
@@ -339,7 +346,7 @@ jobs:
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
+          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

       - name: Upload artifacts
@@ -572,4 +572,11 @@ automatically. For example:
 $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
 ```

-## References
+## Dependencies
+
+- [yhirose/cpp-httplib](https://github.com/yhirose/cpp-httplib) - Single-header HTTP server, used by `llama-server` - MIT license
+- [stb-image](https://github.com/nothings/stb) - Single-header image format decoder, used by multimodal subsystem - Public domain
+- [nlohmann/json](https://github.com/nlohmann/json) - Single-header JSON library, used by various tools/examples - MIT License
+- [minja](https://github.com/google/minja) - Minimal Jinja parser in C++, used by various tools/examples - MIT License
+- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
+- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
@@ -121,8 +121,8 @@ if (LLAMA_LLGUIDANCE)

     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.19 (+ fancy-regex build fix):
-        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
+        # v0.7.20 (+ fix to build on GCC 15):
+        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
@@ -13,10 +13,12 @@
 #include <chrono>
+#include <cstddef>
 #include <cstdio>
 #include <ctime>
+#include <exception>
 #include <iomanip>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -393,8 +395,8 @@ class chat_template {

     for (const auto & message_ : adjusted_messages) {
         auto message = message_;
-        if (!message.contains("role") || !message.contains("content")) {
-            throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+        if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
+            throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
         }
         std::string role = message.at("role");

@@ -415,7 +417,6 @@ class chat_template {
             }
         }
         if (polyfill_tool_calls) {
-            auto content = message.at("content");
             auto tool_calls = json::array();
             for (const auto & tool_call : message.at("tool_calls")) {
                 if (tool_call.at("type") != "function") {
@@ -434,8 +435,11 @@ class chat_template {
             auto obj = json {
                 {"tool_calls", tool_calls},
             };
-            if (!content.is_null() && !content.empty()) {
-                obj["content"] = content;
+            if (message.contains("content")) {
+                auto content = message.at("content");
+                if (!content.is_null() && !content.empty()) {
+                    obj["content"] = content;
+                }
             }
             message["content"] = obj.dump(2);
             message.erase("tool_calls");
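For context, here is a minimal sketch (using nlohmann/json, which these tools already vendor) of the polyfill behavior after this change: an assistant message carrying only tool_calls now validates, and content is folded into the serialized blob only when present and non-empty. The make_polyfilled_content helper and the get_weather tool name below are illustrative, not part of the library.

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

using json = nlohmann::json;

// Illustrative helper mirroring the patched polyfill branch: serialize
// tool_calls (plus content, only when the key exists and is non-empty)
// into a JSON blob that templates without native tool-call support can
// render as plain text.
static std::string make_polyfilled_content(const json & message) {
    json obj = { {"tool_calls", message.at("tool_calls")} };
    if (message.contains("content")) {
        auto content = message.at("content");
        if (!content.is_null() && !content.empty()) {
            obj["content"] = content;
        }
    }
    return obj.dump(2);
}

int main() {
    // "get_weather" is a made-up tool name for demonstration. Note the
    // absence of a "content" key: before this change, validation threw.
    json tool_call = {
        {"type", "function"},
        {"function", { {"name", "get_weather"}, {"arguments", "{}"} }}
    };
    json message = {
        {"role", "assistant"},
        {"tool_calls", json::array({ tool_call })}
    };
    std::cout << make_polyfilled_content(message) << "\n";
}
```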
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cctype>
 #include <cstddef>
+#include <cstdint>
 #include <cmath>
 #include <exception>
 #include <functional>
@@ -233,7 +234,7 @@ public:
             }
         } else if (is_object()) {
             if (!index.is_hashable())
-                throw std::runtime_error("Unashable type: " + index.dump());
+                throw std::runtime_error("Unhashable type: " + index.dump());
             auto it = object_->find(index.primitive_);
             if (it == object_->end())
                 throw std::runtime_error("Key not found: " + index.dump());
@@ -252,7 +253,7 @@ public:
             auto index = key.get<int>();
             return array_->at(index < 0 ? array_->size() + index : index);
         } else if (object_) {
-            if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+            if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
             auto it = object_->find(key.primitive_);
             if (it == object_->end()) return Value();
             return it->second;
@@ -261,7 +262,7 @@ public:
     }
     void set(const Value& key, const Value& value) {
         if (!object_) throw std::runtime_error("Value is not an object: " + dump());
-        if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+        if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
         (*object_)[key.primitive_] = value;
     }
     Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
@@ -398,7 +399,7 @@ public:
             }
             return false;
         } else if (object_) {
-            if (!value.is_hashable()) throw std::runtime_error("Unashable type: " + value.dump());
+            if (!value.is_hashable()) throw std::runtime_error("Unhashable type: " + value.dump());
             return object_->find(value.primitive_) != object_->end();
         } else {
             throw std::runtime_error("contains can only be called on arrays and objects: " + dump());
@@ -416,7 +417,7 @@ public:
         return const_cast<Value*>(this)->at(index);
     }
     Value& at(const Value & index) {
-        if (!index.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+        if (!index.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
         if (is_array()) return array_->at(index.get<int>());
         if (is_object()) return object_->at(index.primitive_);
         throw std::runtime_error("Value is not an array or object: " + dump());
@@ -676,8 +677,8 @@ public:
 class VariableExpr : public Expression {
     std::string name;
 public:
-    VariableExpr(const Location & location, const std::string& n)
-      : Expression(location), name(n) {}
+    VariableExpr(const Location & loc, const std::string& n)
+      : Expression(loc), name(n) {}
     std::string get_name() const { return name; }
     Value do_evaluate(const std::shared_ptr<Context> & context) const override {
         if (!context->contains(name)) {
@@ -1200,9 +1201,9 @@ public:

 class SliceExpr : public Expression {
 public:
-    std::shared_ptr<Expression> start, end;
-    SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
-      : Expression(loc), start(std::move(s)), end(std::move(e)) {}
+    std::shared_ptr<Expression> start, end, step;
+    SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e, std::shared_ptr<Expression> && st = nullptr)
+      : Expression(loc), start(std::move(s)), end(std::move(e)), step(std::move(st)) {}
     Value do_evaluate(const std::shared_ptr<Context> &) const override {
         throw std::runtime_error("SliceExpr not implemented");
     }
@@ -1219,18 +1220,35 @@ public:
         if (!index) throw std::runtime_error("SubscriptExpr.index is null");
         auto target_value = base->evaluate(context);
         if (auto slice = dynamic_cast<SliceExpr*>(index.get())) {
-            auto start = slice->start ? slice->start->evaluate(context).get<int64_t>() : 0;
-            auto end = slice->end ? slice->end->evaluate(context).get<int64_t>() : (int64_t) target_value.size();
+            auto len = target_value.size();
+            auto wrap = [len](int64_t i) -> int64_t {
+                if (i < 0) {
+                    return i + len;
+                }
+                return i;
+            };
+            int64_t step = slice->step ? slice->step->evaluate(context).get<int64_t>() : 1;
+            if (!step) {
+                throw std::runtime_error("slice step cannot be zero");
+            }
+            int64_t start = slice->start ? wrap(slice->start->evaluate(context).get<int64_t>()) : (step < 0 ? len - 1 : 0);
+            int64_t end = slice->end ? wrap(slice->end->evaluate(context).get<int64_t>()) : (step < 0 ? -1 : len);
             if (target_value.is_string()) {
                 std::string s = target_value.get<std::string>();
-                if (start < 0) start = s.size() + start;
-                if (end < 0) end = s.size() + end;
-                return s.substr(start, end - start);
+
+                std::string result;
+                if (start < end && step == 1) {
+                    result = s.substr(start, end - start);
+                } else {
+                    for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
+                        result += s[i];
+                    }
+                }
+                return result;
+
             } else if (target_value.is_array()) {
-                if (start < 0) start = target_value.size() + start;
-                if (end < 0) end = target_value.size() + end;
                 auto result = Value::array();
-                for (auto i = start; i < end; ++i) {
+                for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
                     result.push_back(target_value.at(i));
                 }
                 return result;
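The new evaluation implements Python-style extended slicing: negative indices wrap once around the length, a zero step is rejected, and the start/end defaults depend on the step's sign. A self-contained sketch of the same semantics over std::string (the slice helper below is illustrative, not minja's API):

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

// Python-style s[start:end:step]; mirrors the wrap/default logic added to
// SubscriptExpr above. has_start/has_end model an omitted bound.
static std::string slice(const std::string & s, int64_t start, int64_t end, int64_t step,
                         bool has_start, bool has_end) {
    const int64_t len = (int64_t) s.size();
    if (step == 0) throw std::runtime_error("slice step cannot be zero");
    auto wrap = [len](int64_t i) { return i < 0 ? i + len : i; };
    start = has_start ? wrap(start) : (step < 0 ? len - 1 : 0);
    end   = has_end   ? wrap(end)   : (step < 0 ? -1      : len);
    std::string result;
    for (int64_t i = start; step > 0 ? i < end : i > end; i += step) {
        result += s[i];
    }
    return result;
}

int main() {
    std::cout << slice("hello", 0, 0, -1, false, false) << "\n"; // "olleh", i.e. [::-1]
    std::cout << slice("hello", 1, 4,  1, true,  true)  << "\n"; // "ell",   i.e. [1:4]
}
```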
@@ -1305,6 +1323,8 @@ public:
             if (name == "iterable") return l.is_iterable();
             if (name == "sequence") return l.is_array();
             if (name == "defined") return !l.is_null();
+            if (name == "true") return l.to_bool();
+            if (name == "false") return !l.to_bool();
             throw std::runtime_error("Unknown type for 'is' operator: " + name);
         };
         auto value = eval();
@@ -1520,6 +1540,10 @@ public:
             vargs.expectArgs("endswith method", {1, 1}, {0, 0});
             auto suffix = vargs.args[0].get<std::string>();
             return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
+          } else if (method->get_name() == "startswith") {
+            vargs.expectArgs("startswith method", {1, 1}, {0, 0});
+            auto prefix = vargs.args[0].get<std::string>();
+            return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
           } else if (method->get_name() == "title") {
             vargs.expectArgs("title method", {0, 0}, {0, 0});
             auto res = str;
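The added startswith mirrors the existing endswith: a length guard plus std::equal over the prefix (or reversed suffix). A freestanding equivalent of both checks:

```cpp
#include <algorithm>
#include <cassert>
#include <string>

// Prefix/suffix tests in the same style as minja's startswith/endswith
// string methods shown above.
static bool starts_with(const std::string & str, const std::string & prefix) {
    return prefix.length() <= str.length() && std::equal(prefix.begin(), prefix.end(), str.begin());
}
static bool ends_with(const std::string & str, const std::string & suffix) {
    return suffix.length() <= str.length() && std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
}

int main() {
    assert(starts_with("llama.cpp", "llama"));
    assert(!starts_with("llama", "llama.cpp")); // prefix longer than string
    assert(ends_with("llama.cpp", ".cpp"));
}
```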
@@ -2082,28 +2106,37 @@ private:

         while (it != end && consumeSpaces() && peekSymbols({ "[", "." })) {
             if (!consumeToken("[").empty()) {
                 std::shared_ptr<Expression> index;
-                if (!consumeToken(":").empty()) {
-                    auto slice_end = parseExpression();
-                    index = std::make_shared<SliceExpr>(slice_end->location, nullptr, std::move(slice_end));
-                } else {
-                    auto slice_start = parseExpression();
-                    if (!consumeToken(":").empty()) {
-                        consumeSpaces();
-                        if (peekSymbols({ "]" })) {
-                            index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), nullptr);
-                        } else {
-                            auto slice_end = parseExpression();
-                            index = std::make_shared<SliceExpr>(slice_start->location, std::move(slice_start), std::move(slice_end));
-                        }
-                    } else {
-                        index = std::move(slice_start);
+                auto slice_loc = get_location();
+                std::shared_ptr<Expression> start, end, step;
+                bool has_first_colon = false, has_second_colon = false;
+
+                if (!peekSymbols({ ":" })) {
+                    start = parseExpression();
+                }
+
+                if (!consumeToken(":").empty()) {
+                    has_first_colon = true;
+                    if (!peekSymbols({ ":", "]" })) {
+                        end = parseExpression();
+                    }
+                    if (!consumeToken(":").empty()) {
+                        has_second_colon = true;
+                        if (!peekSymbols({ "]" })) {
+                            step = parseExpression();
+                        }
                     }
                 }
+
+                if ((has_first_colon || has_second_colon) && (start || end || step)) {
+                    index = std::make_shared<SliceExpr>(slice_loc, std::move(start), std::move(end), std::move(step));
+                } else {
+                    index = std::move(start);
+                }
                 if (!index) throw std::runtime_error("Empty index in subscript");
                 if (consumeToken("]").empty()) throw std::runtime_error("Expected closing bracket in subscript");

                 value = std::make_shared<SubscriptExpr>(value->location, std::move(value), std::move(index));
             } else if (!consumeToken(".").empty()) {
                 auto identifier = parseIdentifier();
                 if (!identifier) throw std::runtime_error("Expected identifier in subscript");
@@ -2069,6 +2069,9 @@ class Llama4Model(LlamaModel):
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
         # split the gate_up into gate and up
         if "gate_up_proj" in name:
             name_up = name.replace("gate_up_proj", "up_proj.weight")
@@ -731,6 +731,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
 | GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
+| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
 | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

@@ -741,6 +742,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
+| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
@@ -193,6 +193,7 @@ option(GGML_RPC "ggml: use RPC"
 option(GGML_SYCL        "ggml: use SYCL"                                  OFF)
 option(GGML_SYCL_F16    "ggml: use 16 bit floats for sycl calculations"   OFF)
 option(GGML_SYCL_GRAPH  "ggml: enable graphs in the SYCL backend"         ON)
+option(GGML_SYCL_DNN    "ggml: enable oneDNN in the SYCL backend"         ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                 "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@@ -415,6 +415,13 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
     GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
@@ -1362,6 +1369,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,        flash_attn_ext_q8_0_h256,        has_simdgroup_mm);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK576_HV512, flash_attn_ext_q8_0_hk576_hv512, has_simdgroup_mm);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64,      flash_attn_ext_vec_f16_h64,      has_simdgroup_reduction);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64,     flash_attn_ext_vec_bf16_h64,     has_simdgroup_reduction && use_bfloat);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64,     flash_attn_ext_vec_q4_0_h64,     has_simdgroup_reduction);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64,     flash_attn_ext_vec_q4_1_h64,     has_simdgroup_reduction);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64,     flash_attn_ext_vec_q5_0_h64,     has_simdgroup_reduction);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64,     flash_attn_ext_vec_q5_1_h64,     has_simdgroup_reduction);
+    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64,     flash_attn_ext_vec_q8_0_h64,     has_simdgroup_reduction);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,      flash_attn_ext_vec_f16_h96,      has_simdgroup_reduction);
     GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,     flash_attn_ext_vec_bf16_h96,     has_simdgroup_reduction && use_bfloat);
    GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,     flash_attn_ext_vec_q4_0_h96,     has_simdgroup_reduction);
@@ -4358,7 +4372,7 @@ static bool ggml_metal_encode_node(
                 // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
                 //       for now avoiding mainly to keep the number of templates/kernels a bit lower
                 //       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
-                if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
+                if (ne01 >= 20 || (ne00%128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576)) {
                     switch (src1->type) {
                         case GGML_TYPE_F16:
                             {
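This condition decides when the simdgroup-matrix kernels are used instead of the vec kernels; after the change, head size 64 qualifies for the vec path on small batches. A sketch of the dispatch predicate (the function name is mine; ne01 is the row/batch count and ne00 the head size, as in the diff):

```cpp
#include <cstdint>
#include <iostream>

// Mirrors the updated Metal dispatch check: use the simdgroup-matrix kernels
// for large batches or unsupported head sizes; otherwise the vec kernels
// (which now also cover head size 64) handle small-batch decoding.
static bool use_simdgroup_mm_kernel(int64_t ne01 /* rows */, int64_t ne00 /* head size */) {
    return ne01 >= 20 || (ne00 % 128 != 0 && ne00 != 64 && ne00 != 96 && ne00 != 192 && ne00 != 576);
}

int main() {
    std::cout << use_simdgroup_mm_kernel(1, 64)  << "\n"; // 0: vec kernel (newly allowed)
    std::cout << use_simdgroup_mm_kernel(1, 80)  << "\n"; // 1: unsupported head size
    std::cout << use_simdgroup_mm_kernel(32, 64) << "\n"; // 1: batch too large for vec path
}
```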
@@ -4539,6 +4553,24 @@ static bool ggml_metal_encode_node(
             use_vec_kernel = true;

             switch (ne00) {
+                case 64:
+                    {
+                        switch (src1->type) {
+                            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H64].pipeline;  break;
+                            case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H64].pipeline; break;
+                            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H64].pipeline; break;
+                            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H64].pipeline; break;
+                            case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H64].pipeline; break;
+                            case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H64].pipeline; break;
+                            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H64].pipeline; break;
+                            default:
+                                {
+                                    GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                    GGML_LOG_ERROR("add template specialization for this type\n");
+                                    GGML_ABORT("add template specialization for this type");
+                                }
+                        }
+                    } break;
                 case 96:
                     {
                         switch (src1->type) {
@@ -4124,6 +4124,16 @@ kernel void kernel_flash_attn_ext_vec(

 typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;

+template [[host_name("kernel_flash_attn_ext_vec_f16_h64")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 64, 64, 8>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 64, 64, 8>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 64, 64, 8>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h64")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 64, 64, 8>;
+
 template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]]  kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 96, 96, 4>;
 #if defined(GGML_METAL_USE_BF16)
 template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 96, 96, 4>;
@@ -49,34 +49,38 @@ endif()
 target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")

 # Link against oneDNN
-find_package(DNNL)
 set(GGML_SYCL_DNNL 0)
-if(DNNL_FOUND)
-    if (NOT DEFINED DNNL_GPU_VENDOR)
-        # default to intel target
-        set(DNNL_GPU_VENDOR "INTEL")
-        if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
-            message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
+if(GGML_SYCL_DNN)
+    find_package(DNNL)
+    if(DNNL_FOUND)
+        if (NOT DEFINED DNNL_GPU_VENDOR)
+            # default to intel target
+            set(DNNL_GPU_VENDOR "INTEL")
+            if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
+                message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
+            endif()
         endif()
-    endif()

-    # Verify oneDNN was compiled for the same target as llama
-    if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
-        target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
-        set(GGML_SYCL_DNNL 1)
-        get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
-        foreach(CONFIG ${CONFIGS})
-            get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
-            message(STATUS "Found oneDNN: ${DNNL_LIB}")
-        endforeach()
+        # Verify oneDNN was compiled for the same target as llama
+        if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
+            target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
+            set(GGML_SYCL_DNNL 1)
+            get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
+            foreach(CONFIG ${CONFIGS})
+                get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
+                message(STATUS "Found oneDNN: ${DNNL_LIB}")
+            endforeach()
+        else()
+            message(WARNING
+                "oneDNN must be compiled for the same target as llama.cpp.
+                 llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
+                 Disabling oneDNN support.")
+        endif()
     else()
-        message(WARNING
-            "oneDNN must be compiled for the same target as llama.cpp.
-             llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
-             Disabling oneDNN support.")
+        message(STATUS "oneDNN not found, disabling oneDNN support")
     endif()
 else()
-    message(STATUS "oneDNN not found, disabling oneDNN support")
+    message(STATUS "oneDNN support disabled by the user")
 endif()
 target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
@@ -1,93 +1,74 @@
 #include "binbcast.hpp"

 #include <array>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/sycl.hpp>

 #include "dpct/helper.hpp"
 #include "ggml.h"

-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1, int s2, int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + item_ct1.get_local_id(0)) / ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + item_ct1.get_local_id(0)) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i3*s3 + i2*s2 + i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0; i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __dpct_inline__ void k_bin_bcast_contiguous(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1,
+                                                   dst_t * dst, std::size_t num_elements, const sycl::nd_item<1> & it) {
+    auto element_id   = it.get_global_id(0);
+    auto global_range = it.get_global_range(0);
+    for (; element_id < num_elements; element_id += global_range) {
+        auto src0_float_val = sycl::vec(src0[element_id]).template convert<float, sycl::rounding_mode::rte>();
+        auto src1_float_val = sycl::vec(src1[element_id]).template convert<float, sycl::rounding_mode::rte>();
+        float dst_val       = bin_op(src0_float_val[0], src1_float_val[0]);
+        auto val_to_store   = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
+        dst[element_id]     = val_to_store;
+    }
+}

-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1, int s2, int s3,
-        /*int s00,*/ int s01, int s02, int s03,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i3*s3 + i2*s2 + i1*s1;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static __dpct_inline__ void k_bin_bcast(const src0_t * __restrict__ src0, const src1_t * __restrict__ src1, dst_t * dst,
+                                        int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13,
+                                        int s0, int s1, int s2, int s3, int s00, int s01, int s02, int s03, int s10,
+                                        int s11, int s12, int s13, std::size_t num_dst_elements,
+                                        const sycl::nd_item<1> & item_ct1) {
+    auto calculate_logical_index =
+        [](const std::array<int, 4> & dims, std::size_t element_id) __attribute__((always_inline))->std::array<int, 4> {
+            std::array<int, 4> logical_index;
+#pragma unroll(4)
+            for (int i = 3; i >= 0; i--) {
+                logical_index[i] = element_id % dims[i];
+                element_id /= dims[i];
+            }
+            return logical_index;
+        };
+
+    auto calculate_index = [](const std::array<int, 4> & dims, const std::array<int, 4> & strides,
+                              const std::array<int, 4> & indices) __attribute__((always_inline))
+                               ->std::size_t {
+        std::size_t index = 0;
+#pragma unroll(4)
+        for (int i = 0; i < 4; i++) {
+            auto index_i = indices[i];
+            if (indices[i] >= dims[i]) {
+                index_i = indices[i] % dims[i];
+            }
+            index += strides[i] * index_i;
+        }
+        return index;
+    };
+
+    auto element_id = item_ct1.get_global_id(0);
+    for (; element_id < num_dst_elements; element_id += item_ct1.get_global_range(0)) {
+        auto logical_index  = calculate_logical_index({ ne3, ne2, ne1, ne0 }, element_id);
+        auto src_0_index    = calculate_index({ ne3, ne2, ne1, ne0 }, { s03, s02, s01, s00 }, logical_index);
+        auto src_1_index    = calculate_index({ ne13, ne12, ne11, ne10 }, { s13, s12, s11, s10 }, logical_index);
+        auto dst_index      = calculate_index({ ne3, ne2, ne1, ne0 }, { s3, s2, s1, s0 }, logical_index);
+        auto src0_float_val = sycl::vec(src0[src_0_index]).template convert<float, sycl::rounding_mode::rte>();
+        auto src1_float_val = sycl::vec(src1[src_1_index]).template convert<float, sycl::rounding_mode::rte>();
+        float dst_val       = bin_op(src0_float_val[0], src1_float_val[0]);
+        auto val_to_store   = sycl::vec(dst_val).template convert<dst_t, sycl::rounding_mode::rte>();
+        dst[dst_index]      = val_to_store;
+    }
+}

-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_sycl {
+template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
     template <typename src0_t, typename src1_t, typename dst_t>
     void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
                     const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
@@ -96,165 +77,73 @@ struct bin_bcast_sycl {
                     const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
                     const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
                     const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
-        int nr0 = ne10 / ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne[] = {ne0, ne1, ne2, ne3};
-        int64_t cne0[] = {ne00, ne01, ne02, ne03};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb[] = {nb0, nb1, nb2, nb3};
-        size_t cnb0[] = {nb00, nb01, nb02, nb03};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
-            for (int i = 0; i < 4; i++) {
-                if (nr[i] != 1) {
-                    break;
-                }
-                if (i > 0) {
-                    collapse_nb(cnb, cne);
-                    collapse_nb(cnb0, cne0);
-                    collapse_nb(cnb1, cne1);
-                    collapse(cne);
-                    collapse(cne0);
-                    collapse(cne1);
-                }
-            }
-        }
-        {
-            int64_t ne0 = cne[0];
-            int64_t ne1 = cne[1];
-            int64_t ne2 = cne[2];
-            int64_t ne3 = cne[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb[0];
-            size_t nb1 = cnb[1];
-            size_t nb2 = cnb[2];
-            size_t nb3 = cnb[3];
-
-            size_t nb00 = cnb0[0];
-            size_t nb01 = cnb0[1];
-            size_t nb02 = cnb0[2];
-            size_t nb03 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            size_t s00 = nb00 / sizeof(src0_t);
-            size_t s01 = nb01 / sizeof(src0_t);
-            size_t s02 = nb02 / sizeof(src0_t);
-            size_t s03 = nb03 / sizeof(src0_t);
-
-            GGML_UNUSED(s00);
-
-            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
-            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
-
-            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
-            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
-
-            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
-            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] = std::min<unsigned int>(ne1, block_size / (unsigned int)block_dims[2]);
-            block_dims[0] = std::min(std::min<unsigned int>(ne2 * ne3, block_size / (unsigned int)block_dims[2] / (unsigned int)block_dims[1]), 64U);
-
-            sycl::range<3> block_nums((ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                                      (ne1 + block_dims[1] - 1) / block_dims[1],
-                                      (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});
-
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                                        ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
-                                                        s03, s11, s12, s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:16: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});
-
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                            ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s01, s02, s03, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
-        }
+        auto check_bcast_required = [](const std::array<int64_t, 4> & src_dims,
+                                       const std::array<int64_t, 4> & dst_dims) -> bool {
+            for (int i = 0; i < 4; i++) {
+                if (dst_dims[i] > src_dims[i]) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+        GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+        GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+        GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+        GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+        GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+        GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+        // dst strides in number of elements
+        size_t s0 = nb0 / sizeof(dst_t);
+        size_t s1 = nb1 / sizeof(dst_t);
+        size_t s2 = nb2 / sizeof(dst_t);
+        size_t s3 = nb3 / sizeof(dst_t);
+
+        // src1 strides in number of elements
+        size_t s10 = nb10 / sizeof(src0_t);
+        size_t s11 = nb11 / sizeof(src1_t);
+        size_t s12 = nb12 / sizeof(src1_t);
+        size_t s13 = nb13 / sizeof(src1_t);
+
+        // src0 strides in number of elements
+        size_t s00 = nb00 / sizeof(src0_t);
+        size_t s01 = nb01 / sizeof(src0_t);
+        size_t s02 = nb02 / sizeof(src0_t);
+        size_t s03 = nb03 / sizeof(src0_t);
+
+        std::size_t num_dst_elements = static_cast<std::size_t>(ne0) * static_cast<std::size_t>(ne1) *
+                                       static_cast<std::size_t>(ne2) * static_cast<std::size_t>(ne3);
+        std::size_t local_range  = 256;
+        std::size_t global_range = ceil_div(num_dst_elements, local_range) * local_range;
+
+        bool needs_broadcasting = check_bcast_required({ ne00, ne01, ne02, ne03 }, { ne0, ne1, ne2, ne3 }) ||
+                                  check_bcast_required({ ne10, ne11, ne12, ne13 }, { ne0, ne1, ne2, ne3 });
+        bool all_contiguous = src0_is_contiguous && src1_is_contiguous && dst_is_contiguous;
+
+        if (! needs_broadcasting && all_contiguous) {
+            stream->submit([&](sycl::handler & cgh) {
+                cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
+                    k_bin_bcast_contiguous<bin_op>(src0_dd, src1_dd, dst_dd, num_dst_elements, it);
+                });
+            });
+        } else {
+            stream->submit([&](sycl::handler & cgh) {
+                cgh.parallel_for(sycl::nd_range<1>({ global_range }, { local_range }), [=](sycl::nd_item<1> it) {
+                    k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s0, s1,
+                                        s2, s3, s00, s01, s02, s03, s10, s11, s12, s13, num_dst_elements, it);
+                });
+            });
+        }
     }
 };
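The rewrite replaces the 3D nd-range and its 1D "unravel" fallback with a single grid-stride loop over flat destination element ids: each id is unraveled into a logical 4D index, and per-tensor source indices are re-derived with a modulo in broadcast dimensions. A standalone host-side sketch of that index arithmetic, mirroring the two lambdas in the new k_bin_bcast:

```cpp
#include <array>
#include <cstddef>
#include <iostream>

// Unravel a flat element id into a logical 4D index (dims ordered ne3..ne0).
static std::array<int, 4> calculate_logical_index(const std::array<int, 4> & dims, std::size_t element_id) {
    std::array<int, 4> idx{};
    for (int i = 3; i >= 0; i--) {
        idx[i] = (int) (element_id % dims[i]);
        element_id /= dims[i];
    }
    return idx;
}

// Map the logical index onto a (possibly smaller, broadcast) tensor: any
// coordinate beyond that tensor's extent wraps with modulo before the
// stride multiply.
static std::size_t calculate_index(const std::array<int, 4> & dims, const std::array<int, 4> & strides,
                                   const std::array<int, 4> & indices) {
    std::size_t index = 0;
    for (int i = 0; i < 4; i++) {
        int index_i = indices[i] >= dims[i] ? indices[i] % dims[i] : indices[i];
        index += (std::size_t) strides[i] * index_i;
    }
    return index;
}

int main() {
    // Example: dst is 1x1x2x4 (ne3,ne2,ne1,ne0); src1 is broadcast along ne1.
    auto logical = calculate_logical_index({1, 1, 2, 4}, /*element_id=*/6);     // -> {0,0,1,2}
    std::size_t src1 = calculate_index({1, 1, 1, 4}, {0, 0, 0, 1}, logical);    // i1 wraps to 0 -> 2
    std::cout << logical[2] << " " << logical[3] << " " << src1 << "\n";        // "1 2 2"
}
```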
@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }

+template <typename dst_t>
+static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
+    const int64_t nb = k / QK_K;
+    const size_t local_size = 32;
+    const size_t global_size = nb * local_size;
+
+    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
+
+        cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
+                         [=](sycl::nd_item<1> item_ct1) {
+                             dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
+                         });
+    });
+}
+
 template <typename dst_t>
 static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
@@ -504,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
         case GGML_TYPE_Q3_K:
             return dequantize_row_q3_K_sycl;
         case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
+            if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_K_sycl_reorder;
+            } else {
+                return dequantize_row_q4_K_sycl;
+            }
         case GGML_TYPE_Q5_K:
             return dequantize_row_q5_K_sycl;
         case GGML_TYPE_Q6_K:
@@ -556,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
         case GGML_TYPE_Q3_K:
             return dequantize_row_q3_K_sycl;
         case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_K_sycl_reorder;
+            } else {
+                return dequantize_row_q4_K_sycl;
+            }
         case GGML_TYPE_Q5_K:
             return dequantize_row_q5_K_sycl;
         case GGML_TYPE_Q6_K:
@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
 }
 #endif

+template <typename dst_t>
+inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
+                                   const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
+    const int is = 2 * il;
+    constexpr int n = 4;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, scales_local, sc, m);
+    const float d1 = dall * sc;
+    const float m1 = dmin * m;
+
+    get_scale_min_k4(is + 1, scales_local, sc, m);
+    const float d2 = dall * sc;
+    const float m2 = dmin * m;
+
+    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
+    for (int l = 0; l < n; ++l) {
+        y[l + 0]  = d1 * (q_vec[l] & 0xF) - m1;
+        y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
+    }
+}
+
 template<typename dst_t>
 static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                   uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
     const int64_t i = item_ct1.get_group(2);

 #if QK_K == 256
     // assume 32 threads
     const int64_t tid = item_ct1.get_local_id(2);
-    const int64_t il = tid/8;
-    const int64_t ir = tid%8;
-    const int64_t is = 2*il;
-    const int64_t n = 4;
+    const int64_t il = tid / 8;
+    const int64_t ir = tid % 8;

-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;

     const sycl::half2 dm = x[i].dm;
     const float dall = dm[0];
     const float dmin = dm[1];

-    if (tid < 12)
+    if (tid < 12) {
         scales_local[tid] = x[i].scales[tid];
-    item_ct1.barrier(sycl::access::fence_space::local_space);
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, scales_local, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, scales_local, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
-        y[l +32] = d2 * (q_vec[l] >> 4) - m2;
-    }
+    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
 #else
     const int64_t tid = item_ct1.get_local_id(2);
     const uint8_t * q = x[i].qs;
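The shared helper above captures the scalar core of Q4_K dequantization: each quant byte packs two 4-bit values, the low nibble scaled by (d1, m1) and the high nibble, written 32 outputs later, by (d2, m2). A minimal scalar illustration with made-up scale values:

```cpp
#include <cstdint>
#include <iostream>

int main() {
    // One packed Q4 byte: low nibble 0x3, high nibble 0xA.
    const uint8_t q = 0xA3;
    // Illustrative scale/min pairs; in Q4_K these come from
    // get_scale_min_k4() applied to the block's packed 12-byte scales.
    const float d1 = 0.5f, m1 = 0.25f; // first 32-element half
    const float d2 = 0.1f, m2 = 0.05f; // second 32-element half

    float y0  = d1 * (q & 0xF) - m1;   // low nibble  -> y[l + 0]
    float y32 = d2 * (q >> 4)  - m2;   // high nibble -> y[l + 32]
    std::cout << y0 << " " << y32 << "\n"; // 1.25 0.95
}
```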
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 #endif
 }

+template <typename dst_t>
+static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
+                                          const sycl::nd_item<1> & item_ct1, int64_t nb) {
+    const int64_t i   = item_ct1.get_group(0);     // block index
+    const int64_t tid = item_ct1.get_local_id(0);  // thread index within block
+    const int64_t il  = tid / 8;
+    const int64_t ir  = tid % 8;
+
+    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
+
+    const uint8_t * base       = static_cast<const uint8_t *>(vx);
+    const size_t qs_offset     = i * (QK_K / 2);
+    const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
+    const size_t dm_offset     = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
+
+    const uint8_t * qs_ptr     = base + qs_offset;
+    const uint8_t * scales_ptr = base + scales_offset;
+    ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
+
+    const float dall = dm_values.x();
+    const float dmin = dm_values.y();
+
+    if (tid < 12) {
+        scales_local[tid] = scales_ptr[tid];
+    }
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+    dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
+}
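In the reordered layout the block data is stored as three contiguous regions (all quant bytes, then all scale blocks, then all dm pairs) instead of an array of structs, so the kernel recomputes each region's offset from the block index. A host-side sketch of the offset math (assuming a 4-byte ggml_half2, i.e. two 16-bit halves):

```cpp
#include <cstddef>
#include <iostream>

int main() {
    // Constants as in ggml's Q4_K: 256 weights per block, 12 packed scale bytes.
    const std::size_t QK_K = 256, K_SCALE_SIZE = 12, sizeof_half2 = 4;
    const std::size_t nb = 1024; // number of blocks in the tensor (example value)
    const std::size_t i  = 7;    // block being dequantized

    // Layout: [ all qs | all scales | all dm ], mirroring the reorder kernel.
    std::size_t qs_offset     = i * (QK_K / 2);
    std::size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
    std::size_t dm_offset     = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof_half2;

    std::cout << qs_offset << " " << scales_offset << " " << dm_offset << "\n";
}
```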
 template<typename dst_t>
 static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                   const sycl::nd_item<3> &item_ct1) {
@@ -1129,7 +1129,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
             dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                // reorder is currently not supported for dmmv
+                GGML_ABORT("Unimplemented dequantize case for q4_k reorder");
+            } else {
+                dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            }
             break;
         case GGML_TYPE_Q5_K:
             dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
@@ -655,7 +655,6 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -688,7 +687,6 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -722,7 +720,6 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -754,7 +751,6 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -786,7 +782,6 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -818,7 +813,6 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -850,7 +844,6 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -883,7 +876,6 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -917,7 +909,6 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -949,7 +940,6 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -981,7 +971,6 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1013,7 +1002,6 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1045,7 +1033,6 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1078,7 +1065,6 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1110,7 +1096,6 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1142,7 +1127,6 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1174,7 +1158,6 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1206,7 +1189,6 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1241,7 +1223,6 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1273,7 +1254,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1315,7 +1295,6 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1350,7 +1329,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

@@ -1388,7 +1366,6 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
        }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
            break;
    }
}

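These hunks all appear to drop the same dead statement: GGML_ABORT never returns, so a break placed directly after it is unreachable. A minimal sketch of the pattern, assuming a noreturn abort helper in place of the real GGML_ABORT macro:

// Sketch only: the trailing break is dead code after a noreturn abort.
#include <cstdio>
#include <cstdlib>

[[noreturn]] static void my_abort(const char * msg) {  // stand-in for GGML_ABORT
    std::fprintf(stderr, "%s\n", msg);
    std::abort();
}

static void dispatch(int type) {
    switch (type) {
        case 0:
            break;  // supported type handled above
        default:
            my_abort("GGML tensor type not supported!");
            // a break here could never execute
    }
}

int main() { dispatch(0); }
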
@@ -32,16 +32,36 @@ public:
        else static_assert(0);
    }

    static inline void row_gemm(ggml_backend_sycl_context & ctx, bool a_trans, bool b_trans, int m, int n, int k,
                                const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
    // matrix A has m rows, k columns
    // matrix B has k rows, n columns
    // nra - number of elements to skip when moving into next row in A
    // nrb - number of elements to skip when moving into next row in B
    // nca - number of elements to skip when moving into next column in A
    // ncb - number of elements to skip when moving into next column in B
    // stride_a - number of elements to skip when moving to next A matrix
    // stride_b - number of elements to skip when moving to next B matrix
    // batches_a - number of A matrices
    // batches_b - number of B matrices
    static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
                     const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
                     const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
                     void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) {

        auto stream = ctx.stream_dnnl(q);
        auto eng = ctx.engine_dnnl(q);
        dnnl::memory::dims a_dims = { m, k };
        dnnl::memory::dims b_dims = { k, n };
        dnnl::memory::dims c_dims = { m, n };
        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);

        // { # strides, # rows, # columns }
        dnnl::memory::dims a_dims = { batches_a, m, k };
        dnnl::memory::dims b_dims = { batches_b, k, n };
        dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n };

        // { # elements to skip to next stride, # elements to skip to next row, # elements to skip to next column }
        dnnl::memory::dims a_strides = { stride_a, nra, nca };
        dnnl::memory::dims b_strides = { stride_b, nrb, ncb };

        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::abc);

        dnnl::primitive_attr primitive_attr;
        primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
@@ -63,6 +83,15 @@ public:

        matmul_prim.execute(stream, matmul_args);
    }

    // matrices A and B are column major, both having k rows
    // matrix A has m columns, matrix B has n columns
    // output: column major matrix C = A transposed * B
    static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
                         const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {

        gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
    }
};

#endif

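A call-shape sketch of the generalized entry point (not compilable on its own: ctx, queue and the device buffers come from the surrounding backend code, and the sizes and batch counts are made up). It shows how the stride triples describe two stacked row-major operands:

// Sketch only: batched multiply via the strided descriptors.
// A: batches x (m x k), row-major inside a batch -> next row is k elements away.
// B: batches x (k x n), row-major -> next row is n elements away.
const int m = 64, n = 32, k = 128, batches = 2;
DnnlGemmWrapper::gemm(ctx, m, n, k,
                      a_ptr, DnnlGemmWrapper::to_dt<float>(), /*nra=*/k, /*nca=*/1, /*stride_a=*/m * k,
                      b_ptr, DnnlGemmWrapper::to_dt<float>(), /*nrb=*/n, /*ncb=*/1, /*stride_b=*/k * n,
                      c_ptr, DnnlGemmWrapper::to_dt<float>(), queue, batches, batches);
// row_gemm(ctx, m, n, k, ...) is exactly the single-batch case shown in the diff:
// gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1);
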
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;
int g_ggml_sycl_disable_dnn = 0;
int g_ggml_sycl_prioritize_dmmv = 0;

static ggml_sycl_device_info ggml_sycl_init() {
@@ -196,12 +197,22 @@ static void ggml_check_sycl() try {
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
        g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
        g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
        g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
        GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
        GGML_LOG_INFO("Running with Environment Variables:\n");
        GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
#ifdef GGML_SYCL_GRAPH
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
#else
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
#endif
#if GGML_SYCL_DNNL
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
#else
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
#endif
        GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
        GGML_LOG_INFO("Build with Macros:\n");
#if defined(GGML_SYCL_FORCE_MMQ)
@@ -341,7 +352,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
        assert(tensor->view_src->buffer->buft == buffer->buft);
        return GGML_STATUS_SUCCESS;
    }
    if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
    if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
        ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
        tensor->extra = extra;
        ctx->tensor_extras.push_back(extra); // used to release it when the ctx is destroyed
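For context, get_sycl_env reads an integer from the process environment with a fallback default; a minimal sketch of such a helper (the real implementation in the SYCL backend may differ):

// Sketch only: an env-to-int helper in the spirit of get_sycl_env.
#include <cstdlib>

static int get_env_int(const char * name, int default_val) {
    const char * v = std::getenv(name);
    return v ? std::atoi(v) : default_val;  // non-numeric strings fall back to 0
}

// usage: GGML_SYCL_DISABLE_OPT=0 ./llama-cli ...  ->  get_env_int("GGML_SYCL_DISABLE_OPT", 1) == 0
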
@@ -1985,19 +1996,18 @@ inline void ggml_sycl_op_mul_mat_sycl(

    const int64_t ne00 = src0->ne[0];
    const int64_t ne10 = src1->ne[0];

    GGML_ASSERT(ne00 == ne10);

    const int64_t row_diff = row_high - row_low;

    int id;
    SYCL_CHECK(
        CHECK_TRY_ERROR(id = get_current_device_id()));
#if !GGML_SYCL_DNNL
    const int64_t ne0 = dst->ne[0];

    const int64_t ne0 = dst->ne[0]; // used by MKL only
    // the main device has a larger memory buffer to hold the results from all GPUs
    // ldc == nrows of the matrix that cuBLAS writes into
    int ldc = id == ctx.device ? ne0 : row_diff;
#endif
    int ldc = id == ctx.device ? ne0 : row_diff; // used by MKL only

#ifdef GGML_SYCL_F16
    bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -2033,25 +2043,29 @@ inline void ggml_sycl_op_mul_mat_sycl(
            : src1_as_f16.get();
        ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);

#if !GGML_SYCL_DNNL
        const sycl::half alpha_f16 = 1.0f;
        const sycl::half beta_f16 = 0.0f;
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
            *stream, oneapi::math::transpose::trans,
            oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
            dst_f16.get(), dpct::library_data_t::real_half, ldc,
            dpct::library_data_t::real_half)));
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
#else
        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
            DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
            dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
#if GGML_SYCL_DNNL
        if (!g_ggml_sycl_disable_dnn) {
            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
                DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
                dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
            to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
        }
        else
#endif
        {
            const sycl::half alpha_f16 = 1.0f;
            const sycl::half beta_f16 = 0.0f;
            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
                *stream, oneapi::math::transpose::trans,
                oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
                &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
                src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
                dst_f16.get(), dpct::library_data_t::real_half, ldc,
                dpct::library_data_t::real_half)));
            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
            to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
        }
    }
    else {
        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
@@ -2072,18 +2086,22 @@ inline void ggml_sycl_op_mul_mat_sycl(
        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

#if !GGML_SYCL_DNNL
        const float alpha = 1.0f;
        const float beta = 0.0f;
        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
            get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
            src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
            dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
#else
        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
            DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
            dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
#if GGML_SYCL_DNNL
        if (!g_ggml_sycl_disable_dnn) {
            DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
                DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
                dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
        }
        else
#endif
        {
            const float alpha = 1.0f;
            const float beta = 0.0f;
            SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
                get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
                src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
                dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
        }
    }
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddq_i);
@@ -2697,7 +2715,7 @@ catch (sycl::exception const &exc) {
    std::exit(1);
}

static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, void * dst,
                                   const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
                                   size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
                                   int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
@@ -2713,7 +2731,7 @@ static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::h

    const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
    const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
    uint8_t * dst_bytes = reinterpret_cast<uint8_t *>(dst);
    uint8_t * dst_bytes = static_cast<uint8_t *>(dst);

    ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
    ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
@@ -2726,6 +2744,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    GGML_TENSOR_BINARY_OP_LOCALS

@@ -2766,7 +2785,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    }

    ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
    char * dst_t = reinterpret_cast<char *>(dst_ddf);

    dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
    dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
@@ -2783,42 +2801,83 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
    GGML_ASSERT(ne01 == static_cast<int64_t>(nb1/nb0));
    GGML_ASSERT(ne10 == ne00);

    // broadcast factors
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;

    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
                                                    oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
                                                    src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
                                                    src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
                                                    mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
    } else {
        const int ne23 = ne12 * ne13;
#if GGML_SYCL_DNNL
    if (!g_ggml_sycl_disable_dnn) {
        auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
            (const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {

        ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
        ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
        ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
            DnnlGemmWrapper::gemm(ctx, ne11, ne01, ne10,
                src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
                src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
                dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
        };

        sycl::range<3> block_dims(1, ne12, ne13);
        queue->submit([&](sycl::handler & cgh) {
            const void ** ptrs_src_get = ptrs_src.get();
            void ** ptrs_dst_get = ptrs_dst.get();
            size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
            size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
            cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
                k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
                                       nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
        if (r2 == 1 && r3 == 1) {
            if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
                dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
            }
            else {
                for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
                    const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
                    float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
                    dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
                }
            }
        } else {
            // iterate over batches from smaller set of matrices (matrix 0)
            for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
                for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
                    const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
                    const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
                    float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
                    dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
                }
            }
        }
    }
    else
#endif
    {
        if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
            // there is no broadcast and src0, src1 are contiguous across dims 2, 3
            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
                                                        oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
                                                        src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
                                                        src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
                                                        mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
        } else {
            const int ne23 = ne12 * ne13;

            ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
            ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
            ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);

            sycl::range<3> block_dims(1, ne12, ne13);
            queue->submit([&](sycl::handler & cgh) {
                const void ** ptrs_src_get = ptrs_src.get();
                void ** ptrs_dst_get = ptrs_dst.get();
                size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
                size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
                    k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
                                           nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
                });
            });
            });

            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
                *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
                (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
                (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
                (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
                *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
                (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
                (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
                (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
        }
    }
} catch (const sycl::exception & exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
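The pointer-table path exists because ggml broadcasts src0 across the batch dimensions: dst batch (i12, i13) reads src0 batch (i12 / r2, i13 / r3). A minimal sketch of that index mapping, with made-up batch counts:

// Sketch only: broadcast index mapping used by the batched pointer table.
#include <cstdint>

int main() {
    const int64_t ne02 = 2, ne03 = 1;                   // src0 batches (illustrative)
    const int64_t ne12 = 8, ne13 = 1;                   // src1/dst batches
    const int64_t r2 = ne12 / ne02, r3 = ne13 / ne03;   // broadcast factors: 4 and 1
    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const int64_t i02 = i12 / r2;               // four dst batches reuse each src0 batch
            const int64_t i03 = i13 / r3;
            // gemm for this entry: dst[i12,i13] = src0[i02,i03]^T * src1[i12,i13]
            (void) i02; (void) i03;
        }
    }
}
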
@@ -2841,6 +2900,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
            return true;
        case GGML_TYPE_Q4_K:
            return !g_ggml_sycl_prioritize_dmmv;
        default:
            return false;
    }
@@ -2858,6 +2919,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_K:
            return true;
        default:
            return false;
@@ -2883,16 +2945,16 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
    }
}

static void reorder_qw(char *data_device, const int ncols, const int nrows,
                       size_t size, size_t offset, dpct::queue_ptr stream) {
    auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
                            dpct::queue_ptr stream) {
    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
    SYCL_CHECK(
        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
            .wait()));
    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
    int offset_blks = offset / sizeof(block_q4_0);
    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
    auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

    stream->parallel_for(
@@ -2906,18 +2968,59 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
            *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
        }
        *(d_ptr + ib) = x[ib].d;
    });
    }).wait_and_throw();

    sycl::free(tmp_buf, *stream);
}

static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
    GGML_ASSERT(size % sizeof(block_q4_K) == 0);
    GGML_ASSERT(offset % sizeof(block_q4_K) == 0);

    const int nblocks = size / sizeof(block_q4_K);

    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));

    auto * qs_ptr = data_device;
    auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
    auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);

    stream->parallel_for(nblocks, [=](auto i) {
        const block_q4_K * x = (const block_q4_K *) tmp_buf;
        const int ib = i;

        for (int j = 0; j < QK_K / 2; ++j) {
            qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
        }

        for (int j = 0; j < K_SCALE_SIZE; ++j) {
            scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
        }

        dm_ptr[ib] = x[ib].dm;
    }).wait_and_throw();

    sycl::free(tmp_buf, *stream);
}

static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
    char*data_device = (char*)src0->data;
    uint8_t * data_device = (uint8_t *) src0->data;
    size_t ncols = src0->ne[0];
    size_t nrows = src0->ne[1];
    size_t size = ggml_nbytes(src0);

    reorder_qw(data_device, ncols, nrows, size, 0, stream);
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
            reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
            break;
        case GGML_TYPE_Q4_K:
            reorder_qw_q4_k(data_device, size, 0, stream);
            break;
        default:
            GGML_ABORT("reorder_qw() called with unsupported type");
            break;
    }
}

static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
@@ -2960,8 +3063,18 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
    extra->optimized_feature.reorder = true; // used to decode/dequantize in later steps and avoid re-reordering
}

static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
}

static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
           src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
}

static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
    int64_t min_compute_capability = INT_MAX;

@@ -2984,13 +3097,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    }

    // check data types and tensor shapes for custom matrix multiplication kernels:
    bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
    bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);

    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
    bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);

    bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
@@ -3713,7 +3822,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
        return GGML_STATUS_SUCCESS;
    }

    sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
    sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()), {sycl_ex::property::graph::assume_buffer_outlives_graph{}});

    model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
    ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
    model_sycl_graph.end_recording();

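The begin_recording/end_recording pair follows the record-and-replay shape of the oneAPI SYCL graph extension; a minimal sketch of the lifecycle (the added property name is taken from the diff, the rest is illustrative and assumes a DPC++ build with the extension):

// Sketch only: capture a queue's submissions into a command graph, then replay them.
namespace sycl_ex = sycl::ext::oneapi::experimental;

sycl::queue q;
sycl_ex::command_graph g(q, {sycl_ex::property::graph::assume_buffer_outlives_graph{}});

g.begin_recording(q);
// ... submissions to q are captured here instead of executing ...
g.end_recording();

auto exec = g.finalize();   // bake the recording into an executable graph
q.ext_oneapi_graph(exec);   // replay the whole captured workload in one call
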
@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
    const int blocks_per_row = ncols / block_traits::qk;
    constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
    const int nblocks = nrows * (ncols / block_traits::qk);

    static_assert(blocks_per_subgroup > 0);
    static_assert(block_elements_per_subgroup > 0);
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
            // x block quant index when casting the quants to int
            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);

            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
        }
    }

@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
    }
}

static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
                                               const int nrows, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
    constexpr size_t num_subgroups = 16;
    GGML_ASSERT(block_num_y % num_subgroups == 0);

    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

    stream->submit([&](sycl::handler & cgh) {
        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
                                                                                           nrows, nd_item);
                         });
    });
}

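For Q4_K the new subgroup constants work out as follows; a sketch assuming WARP_SIZE = 32 and the usual QI4_K value from the ggml quant definitions:

// Sketch only: the per-subgroup work split for Q4_K, with assumed trait values.
constexpr int WARP_SIZE = 32;
constexpr int qi        = 32;   // QI4_K: 4-byte quant ints per block (assumed)
constexpr int vdr       = 2;    // vdr_mmvq from block_q_t<GGML_TYPE_Q4_K>::traits

constexpr int blocks_per_subgroup         = (vdr * WARP_SIZE + qi - 1) / qi;  // ceil_div -> 2
constexpr int block_elements_per_subgroup = qi / vdr;                         // 16 lanes per block
static_assert(blocks_per_subgroup == 2 && block_elements_per_subgroup == 16, "Q4_K split");
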
static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
@@ -1035,7 +1057,12 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                break;
            case GGML_TYPE_Q4_K:
                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
                    reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                } else {
                    mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
                }
                break;
            case GGML_TYPE_Q5_K:
                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);

@@ -56,6 +56,28 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};

template <> struct block_q_t<GGML_TYPE_Q4_K> {
    struct traits {
        static constexpr uint32_t qk = QK_K;
        static constexpr uint32_t qi = QI4_K;
        static constexpr uint32_t qr = QR4_K;
        static constexpr uint32_t vdr_mmvq = 2;
    };

    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }

    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
        auto nblocks = (nrows * (ncols / traits::qk));
        return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }

    constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }

    constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
};

} // namespace ggml_sycl_reordered

#endif // GGML_SYCL_QUANTS_HPP

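To make get_d_offset concrete: a 4096 x 4096 Q4_K weight has 4096 * (4096 / 256) = 65536 blocks, so the half2 (d, dmin) pairs start after roughly 8 MiB of nibbles plus 768 KiB of scales. A sketch of the arithmetic, assuming QK_K = 256 and K_SCALE_SIZE = 12:

// Sketch only: where the dm pair of block i lands for a 4096 x 4096 Q4_K tensor.
constexpr long long nrows = 4096, ncols = 4096, QK_K = 256, K_SCALE_SIZE = 12;
constexpr long long nblocks      = nrows * (ncols / QK_K);   // 65536 blocks
constexpr long long qs_bytes     = nblocks * QK_K / 2;       // 8388608: packed nibbles
constexpr long long scales_bytes = nblocks * K_SCALE_SIZE;   // 786432: scale bytes
// dm of block i: qs_bytes + scales_bytes + i * sizeof(ggml_half2) (4 bytes)
static_assert(qs_bytes == 8388608 && scales_bytes == 786432, "layout arithmetic");
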
@@ -285,7 +285,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
    }

    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
                                     const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
                                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
        int v[q4_0_traits::vdr_mmvq];
@@ -303,6 +303,67 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
    };
};

static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
                                             const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
                                             const int & iqs) {
    int v[2];
    int u[2 * QR4_K];
    float d8[QR4_K];

    v[0] = q4[0];
    v[1] = q4[4];

    uint16_t aux[2];
    const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
    if (j < 2) {
        aux[0] = scales[j + 0] & 0x3f3f;
        aux[1] = scales[j + 2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
    }

    const uint8_t * sc = (const uint8_t *) aux;
    const uint8_t * m = sc + 2;

    const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds[0];

        const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
        u[2 * i + 0] = q8[0];
        u[2 * i + 1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
}

template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
    static constexpr ggml_type gtype = GGML_TYPE_Q4_K;

    using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
    using q4_k_traits = typename q4_k_block::traits;

    float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
                     const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
        const int ib = ibx_offset / (QK_K / 2);

        const uint8_t * base = static_cast<const uint8_t *>(vbq);
        const uint8_t * qs = base + ibx_offset;
        const int total_qs_bytes = nblocks * (QK_K / 2);
        const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
        const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);

        const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
        const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
        const uint16_t * scales = (const uint16_t *) scs;

        return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
    }
};

#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ 4

@@ -649,52 +710,17 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}

static __dpct_inline__ float
vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
                                               const int & iqs) {
#ifndef GGML_QKK_64

    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    int v[2];
    int u[2*QR4_K];
    float d8[QR4_K];
    const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
    const int * q4 = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
    const uint16_t * scales = (const uint16_t *) bq4_K->scales;

    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    v[0] = q4[0];
    v[1] = q4[4];

    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds[0];

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
    return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);

#else

@@ -5872,10 +5872,17 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    vk_pipeline *pipelines;
    bool small_rows = N <= get_fa_num_small_rows(path);

    // coopmat1 does not actually support "small rows" (it needs 16 rows).
    // So use scalar instead.
    if (small_rows && path == FA_COOPMAT1) {
        path = FA_SCALAR;
    }

    // scalar is faster than coopmat2 when N==1
    if (N == 1 && path == FA_COOPMAT2) {
        path = FA_SCALAR;
    }

    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;

    switch (path) {

@@ -299,10 +299,10 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
            return false;
        }
    } catch (std::length_error &) {
        fprintf(stderr, "%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
        GGML_LOG_ERROR("%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
        return false;
    } catch (std::bad_alloc &) {
        fprintf(stderr, "%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
        GGML_LOG_ERROR("%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
        return false;
    }
    kv.emplace_back(key, value);
@@ -328,14 +328,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    ok = ok && gr.read(magic, 4);

    if (!ok) {
        fprintf(stderr, "%s: failed to read magic\n", __func__);
        GGML_LOG_ERROR("%s: failed to read magic\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }

    for (uint32_t i = 0; i < magic.size(); i++) {
        if (magic[i] != GGUF_MAGIC[i]) {
            fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
            GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
            gguf_free(ctx);
            return nullptr;
        }
@@ -348,11 +348,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

    if (ok && gr.read(ctx->version)) {
        if (ctx->version == 1) {
            fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
            GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
            ok = false;
        }
        if (ctx->version > GGUF_VERSION) {
            fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
            GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
                __func__, ctx->version, GGUF_VERSION);
            ok = false;
        }
@@ -363,7 +363,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    if (ok && gr.read(n_tensors)) {
        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
        if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
            fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
            GGML_LOG_ERROR("%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
                __func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
            ok = false;
        }
@@ -374,7 +374,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    if (ok && gr.read(n_kv)) {
        static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
        if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
            fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
            GGML_LOG_ERROR("%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
                __func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
            ok = false;
        }
@@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    }

    if (!ok) {
        fprintf(stderr, "%s: failed to read header\n", __func__);
        GGML_LOG_ERROR("%s: failed to read header\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }
@@ -399,15 +399,15 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
        try {
            ok = ok && gr.read(key);
        } catch (std::length_error &) {
            fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
            GGML_LOG_ERROR("%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
            ok = false;
        } catch (std::bad_alloc &) {
            fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
            GGML_LOG_ERROR("%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
            ok = false;
        }
        for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
            if (key == ctx->kv[j].key) {
                fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
                GGML_LOG_ERROR("%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
                ok = false;
            }
        }
@@ -441,14 +441,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
            case GGUF_TYPE_ARRAY:
            default:
                {
                    fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
                    GGML_LOG_ERROR("%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
                    ok = false;
                } break;
        }
    }

    if (!ok) {
        fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
        GGML_LOG_ERROR("%s: failed to read key-value pairs\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }
@@ -458,7 +458,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);

    if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
        fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
        GGML_LOG_ERROR("%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
        gguf_free(ctx);
        return nullptr;
    }
@@ -474,14 +474,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
        try {
            ok = ok && gr.read(name);
        } catch (std::length_error &) {
            fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
            GGML_LOG_ERROR("%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
            ok = false;
        } catch (std::bad_alloc &) {
            fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
            GGML_LOG_ERROR("%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
            ok = false;
        }
        if (name.length() >= GGML_MAX_NAME) {
            fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
            GGML_LOG_ERROR("%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
            ok = false;
            break;
        }
@@ -490,7 +490,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
        // make sure there are no duplicate tensor names
        for (int64_t j = 0; ok && j < i; ++j) {
            if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
                fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
                GGML_LOG_ERROR("%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
                ok = false;
                break;
            }
@@ -505,7 +505,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
        uint32_t n_dims = -1;
        ok = ok && gr.read(n_dims);
        if (n_dims > GGML_MAX_DIMS) {
            fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
            GGML_LOG_ERROR("%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
                __func__, info.t.name, n_dims, GGML_MAX_DIMS);
            ok = false;
            break;
@@ -518,7 +518,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

            // check that all ne are non-negative
            if (info.t.ne[j] < 0) {
                fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
                GGML_LOG_ERROR("%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
                    __func__, info.t.name, j, info.t.ne[j]);
                ok = false;
                break;
@@ -530,7 +530,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
            (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
            (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {

            fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
            GGML_LOG_ERROR("%s: total number of elements in tensor '%s' with shape "
                "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
                __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
            ok = false;
@@ -547,7 +547,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

        // check that tensor type is within defined range
        if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
            fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
            GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d (%s)\n",
                __func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
            ok = false;
            break;
@@ -557,7 +557,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

        // check that row size is divisible by block size
        if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
            fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
            GGML_LOG_ERROR("%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
                "not a multiple of block size (%" PRId64 ")\n",
                __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
            ok = false;
@@ -582,7 +582,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    }

    if (!ok) {
        fprintf(stderr, "%s: failed to read tensor info\n", __func__);
        GGML_LOG_ERROR("%s: failed to read tensor info\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }
@@ -590,7 +590,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

    // we require the data section to be aligned, so take into account any padding
    if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
        fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
        GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
        gguf_free(ctx);
        return nullptr;
    }
@@ -604,9 +604,9 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    for (size_t i = 0; i < ctx->info.size(); ++i) {
        const gguf_tensor_info & ti = ctx->info[i];
        if (ti.offset != ctx->size) {
            fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
            GGML_LOG_ERROR("%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
                __func__, ti.t.name, ti.offset, ctx->size);
            fprintf(stderr, "%s: failed to read tensor data\n", __func__);
            GGML_LOG_ERROR("%s: failed to read tensor data\n", __func__);
            gguf_free(ctx);
            return nullptr;
        }
@@ -634,7 +634,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

        *params.ctx = ggml_init(pdata);
        if (*params.ctx == nullptr) {
            fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
            GGML_LOG_ERROR("%s: failed to initialize ggml context for storing tensors\n", __func__);
            gguf_free(ctx);
            return nullptr;
        }
@@ -656,7 +656,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
        ok = ok && gr.read(data->data, ctx->size);

        if (!ok) {
            fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
            GGML_LOG_ERROR("%s: failed to read tensor data binary blob\n", __func__);
            ggml_free(ctx_data);
            *params.ctx = nullptr;
            gguf_free(ctx);
@@ -689,7 +689,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    }

    if (!ok) {
        fprintf(stderr, "%s: failed to create tensors\n", __func__);
        GGML_LOG_ERROR("%s: failed to create tensors\n", __func__);
        ggml_free(ctx_data);
        *params.ctx = nullptr;
        gguf_free(ctx);
@@ -706,7 +706,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    FILE * file = ggml_fopen(fname, "rb");

    if (!file) {
        fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
        GGML_LOG_ERROR("%s: failed to open GGUF file '%s'\n", __func__, fname);
        return nullptr;
    }

@@ -1305,7 +1305,7 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
    FILE * file = ggml_fopen(fname, "wb");

    if (!file) {
        fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
        GGML_LOG_ERROR("%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
        return false;
    }

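The change throughout this file is a one-for-one swap of fprintf(stderr, ...) for GGML_LOG_ERROR with unchanged format strings, which routes the messages through ggml's pluggable log callback instead of always writing to stderr. A minimal sketch of what that enables for embedders (ggml_log_set and the level enum come from ggml.h; the handler itself is made up):

// Sketch only: capture gguf/ggml error messages in a custom handler.
#include "ggml.h"
#include <cstdio>

static void my_log(enum ggml_log_level level, const char * text, void * /*user_data*/) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        std::fprintf(stderr, "[gguf] %s", text);  // or forward to an application logger
    }
}

int main() {
    ggml_log_set(my_log, nullptr);  // GGML_LOG_ERROR output now lands in my_log
}
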
@@ -823,6 +823,7 @@ class GGUFEditorWindow(QMainWindow):
        self.modified = False
        self.metadata_changes = {}  # Store changes to apply when saving
        self.metadata_to_remove = set()  # Store keys to remove when saving
        self.on_metadata_changed_is_connected = False

        self.setup_ui()

@@ -941,9 +942,11 @@ class GGUFEditorWindow(QMainWindow):
            return

        # Disconnect to prevent triggering during loading
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            self.metadata_table.itemChanged.disconnect(self.on_metadata_changed)
        if self.on_metadata_changed_is_connected:
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')
                self.metadata_table.itemChanged.disconnect(self.on_metadata_changed)
            self.on_metadata_changed_is_connected = False

        for i, (key, field) in enumerate(self.reader.fields.items()):
            self.metadata_table.insertRow(i)
@@ -1021,6 +1024,7 @@ class GGUFEditorWindow(QMainWindow):

        # Reconnect after loading
        self.metadata_table.itemChanged.connect(self.on_metadata_changed)
        self.on_metadata_changed_is_connected = True

    def extract_array_values(self, field: ReaderField) -> list:
        """Extract all values from an array field."""

@@ -68,7 +68,7 @@ class TensorNameMap:
            "output_layer", # chatglm
            "head", # rwkv
            "head.out", # wavtokenizer
            "language_model.lm_head", # llama4
            "lm_head", # llama4
        ),

        # Output norm
@@ -91,7 +91,7 @@ class TensorNameMap:
            "rwkv.ln_out", # rwkv6
            "model.ln_out", # rwkv7
            "backbone.final_layer_norm", # wavtokenizer
            "language_model.model.norm", # llama4
            "model.norm", # llama4
        ),

        # Rope frequencies
@@ -133,7 +133,7 @@ class TensorNameMap:
            "transformer.layers.{bid}.attn_norm", # openelm
            "rwkv.blocks.{bid}.ln1", # rwkv6
            "model.layers.{bid}.ln1", # rwkv7
            "language_model.model.layers.{bid}.input_layernorm", # llama4
            "model.layers.{bid}.input_layernorm", # llama4
        ),

        # Attention norm 2
@@ -173,7 +173,7 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wq", # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
            "transformer.h.{bid}.attn.attention.q_proj", # exaone
            "language_model.model.layers.{bid}.self_attn.q_proj", # llama4
            "model.layers.{bid}.self_attn.q_proj", # llama4
        ),

        # Attention key
@@ -188,7 +188,7 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wk", # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
            "transformer.h.{bid}.attn.attention.k_proj", # exaone
            "language_model.model.layers.{bid}.self_attn.k_proj", # llama4
            "model.layers.{bid}.self_attn.k_proj", # llama4
        ),

        # Attention value
@@ -202,7 +202,7 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wv", # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
            "transformer.h.{bid}.attn.attention.v_proj", # exaone
            "language_model.model.layers.{bid}.self_attn.v_proj", # llama4
            "model.layers.{bid}.self_attn.v_proj", # llama4
        ),

        # Attention output
@@ -229,7 +229,7 @@ class TensorNameMap:
            "encoder.layers.{bid}.self_attention.dense", # chatglm
            "transformer.layers.{bid}.attn.out_proj", # openelm
            "transformer.h.{bid}.attn.attention.out_proj", # exaone
            "language_model.model.layers.{bid}.self_attn.o_proj", # llama4
            "model.layers.{bid}.self_attn.o_proj", # llama4
        ),

        # Attention output norm
@@ -268,7 +268,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
            "encoder.layers.{bid}.post_attention_layernorm", # chatglm
            "transformer.layers.{bid}.ffn_norm", # openelm
            "language_model.model.layers.{bid}.post_attention_layernorm", # llama4
            "model.layers.{bid}.post_attention_layernorm", # llama4
        ),

        # Post feed-forward norm
@@ -289,7 +289,7 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.router", # Grok
            "transformer.blocks.{bid}.ffn.router.layer", # dbrx
            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
            "language_model.model.layers.{bid}.feed_forward.router", # llama4
            "model.layers.{bid}.feed_forward.router", # llama4
            "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe
        ),

@@ -329,7 +329,7 @@ class TensorNameMap:
            "model.layers.{bid}.residual_mlp.w3", # arctic
            "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
            "transformer.h.{bid}.mlp.c_fc_1", # exaone
            "language_model.model.layers.{bid}.feed_forward.up_proj", # llama4
            "model.layers.{bid}.feed_forward.up_proj", # llama4
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
@@ -338,14 +338,14 @@ class TensorNameMap:
            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
            "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
            "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe
        ),

        MODEL_TENSOR.FFN_UP_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
            "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
            "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
        ),

        # AWQ-activation gate
@@ -366,22 +366,22 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.linear_1", # refact
            "model.layers.{bid}.residual_mlp.w1", # arctic
            "transformer.h.{bid}.mlp.c_fc_0", # exaone
            "language_model.model.layers.{bid}.feed_forward.gate_proj", # llama4
            "model.layers.{bid}.feed_forward.gate_proj", # llama4
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
            "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
            "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
            "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
            "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
            "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
        ),

        MODEL_TENSOR.FFN_GATE_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
            "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
            "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
        ),

        # Feed-forward down
@@ -410,7 +410,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
            "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
            "model.layers.h.{bid}.mlp.c_proj", # exaone
            "language_model.model.layers.{bid}.feed_forward.down_proj", # llama4
            "model.layers.{bid}.feed_forward.down_proj", # llama4
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -420,15 +420,15 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
            "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
            "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
            "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe
        ),

        MODEL_TENSOR.FFN_DOWN_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
            "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
            "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
            "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
            "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (

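Each tuple above lists the per-architecture source names that map to one canonical GGUF tensor, with `{bid}` standing for the block index; the llama4 hunks drop the old `language_model.` prefix. A rough TypeScript sketch of how such a table can be resolved — the candidate list and canonical name below are illustrative excerpts, not the full map:

```ts
// Illustrative excerpt: candidate checkpoint names for one canonical tensor.
const FFN_UP_CANDIDATES = [
  'transformer.h.{bid}.mlp.c_fc_1',           // exaone
  'model.layers.{bid}.feed_forward.up_proj',  // llama4
];

// Resolve a checkpoint tensor name to a canonical GGUF name, if any.
// Suffix handling (e.g. '.weight') is omitted for brevity.
function mapTensorName(srcName: string, nBlocks: number): string | undefined {
  for (let bid = 0; bid < nBlocks; bid++) {
    for (const tmpl of FFN_UP_CANDIDATES) {
      if (tmpl.replace('{bid}', String(bid)) === srcName) {
        return `blk.${bid}.ffn_up`; // canonical name, illustrative
      }
    }
  }
  return undefined;
}

// mapTensorName('model.layers.3.feed_forward.up_proj', 32) => 'blk.3.ffn_up'
```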
@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(

        meta.reset(gguf_init_from_file(fname.c_str(), params));
        if (!meta) {
            throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
            throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
        }

        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
            };
            gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
            if (!ctx_gguf) {
                throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
                throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
            }

            // check idx

@@ -140,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
        struct llama_model_params params) {
    ggml_time_init();

    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
        return nullptr;
    }

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;

@@ -687,7 +687,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
            auto value = argv[i];
            auto * value = argv[i];
            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
            if (buft_list.empty()) {
                // enumerate all the devices and add their buffer types to the list
@@ -719,7 +719,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            // memory leak present in the implementation
            // over in arg.cpp. Acceptable because we
            // only parse these args once in this program.
            auto override_group = value;
            auto * override_group = value;
            if (value[override_group_span_len] == '\0') {
                value = &value[override_group_span_len];
                last_group = true;
@@ -730,7 +730,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
            auto override_span_len = std::strcspn(override_group, ";");
            while (override_span_len > 0) {
                auto override = override_group;
                auto * override = override_group;
                if (override_group[override_span_len] != '\0') {
                    override_group[override_span_len] = '\0';
                    override_group = &override_group[override_span_len + 1];
@@ -743,9 +743,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                    break;
                }
                override[tensor_name_span_len] = '\0';
                auto tensor_name = override;
                auto buffer_type = &override[tensor_name_span_len + 1];
                auto * tensor_name = override;
                auto * buffer_type = &override[tensor_name_span_len + 1];
                if (buft_list.find(buffer_type) == buft_list.end()) {
                    printf("error: unrecognized buffer type '%s'\n", buffer_type);
                    printf("Available buffer types:\n");
                    for (const auto & it : buft_list) {
                        printf("  %s\n", ggml_backend_buft_name(it.second));
@@ -1826,10 +1827,11 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif

    cmd_params params = parse_cmd_params(argc, argv);

    // initialize backends
    ggml_backend_load_all();

    cmd_params params = parse_cmd_params(argc, argv);

    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);

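The hunks above convert deduced pointers to explicit `auto *` and reorder `main` so backends are loaded before argument parsing (the buffer-type list can only be enumerated once backends exist). The in-place `strcspn` parse walks a grammar of `;`-separated groups of `tensor_name=buffer_type` pairs; a non-destructive sketch of the same grammar, with illustrative names:

```ts
interface TensorBuftOverride { tensorName: string; bufferType: string; }

// Sketch only: the real parser mutates the argv buffer in place with
// std::strcspn instead of allocating substrings.
function parseOverrides(value: string, knownBufts: Set<string>): TensorBuftOverride[] {
  const out: TensorBuftOverride[] = [];
  for (const group of value.split(';')) {
    if (group.length === 0) continue; // a trailing ';' ends the list
    const eq = group.indexOf('=');
    if (eq < 0) throw new Error(`expected tensor_name=buffer_type, got '${group}'`);
    const tensorName = group.slice(0, eq);
    const bufferType = group.slice(eq + 1);
    if (!knownBufts.has(bufferType)) {
      throw new Error(`unrecognized buffer type '${bufferType}'`);
    }
    out.push({ tensorName, bufferType });
  }
  return out;
}
```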
Binary file not shown.
@@ -2251,6 +2251,14 @@ struct server_context {
            slot.has_next_token = true;
        }

        // if context shifting is disabled, make sure that we don't run out of context
        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
            slot.stop           = STOP_TYPE_LIMIT;
            slot.has_next_token = false;

            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
        }

        // check the limits
        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
            slot.stop = STOP_TYPE_LIMIT;

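A minimal restatement of the new stop condition: with context shifting disabled, the slot must stop before the next token would overflow the context. The field names mirror the C++ members, but the types here are illustrative:

```ts
interface Slot {
  nPast: number;       // tokens already in the context
  nCtx: number;        // context size for this slot
  stop: 'none' | 'limit';
  hasNextToken: boolean;
}

function applyCtxLimit(slot: Slot, ctxShiftEnabled: boolean): void {
  // the next token (n_past + 1) must still fit inside n_ctx
  if (!ctxShiftEnabled && slot.nPast + 1 >= slot.nCtx) {
    slot.stop = 'limit';
    slot.hasNextToken = false;
  }
}
```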
@@ -65,3 +65,21 @@ def test_ctx_shift_disabled_long_prompt():
    assert res.status_code != 200
    assert "error" in res.body
    assert "exceeds the available context size" in res.body["error"]["message"]


def test_ctx_shift_disabled_stream():
    global server
    server.disable_ctx_shift = True
    server.start()
    res = server.make_stream_request("POST", "/v1/completions", data={
        "n_predict": 256,
        "prompt": "Once",
        "stream": True,
    })
    content = ""
    for data in res:
        choice = data["choices"][0]
        if choice["finish_reason"] == "length":
            assert len(content) > 0
        else:
            assert choice["finish_reason"] is None
            content += choice["text"]

tools/server/webui/package-lock.json (generated, 278 lines changed)
@@ -18,6 +18,7 @@
        "dexie": "^4.0.11",
        "highlight.js": "^11.10.0",
        "katex": "^0.16.15",
        "pdfjs-dist": "^5.2.133",
        "postcss": "^8.4.49",
        "react": "^18.3.1",
        "react-dom": "^18.3.1",
@@ -988,7 +989,7 @@
      "version": "0.3.8",
      "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.8.tgz",
      "integrity": "sha512-imAbBGkb+ebQyxKgzv5Hu2nmROxoDOXHh80evxdoXNOrvAnVx7zimzc1Oo5h9RlfV4vPXaE2iM5pOFbvOCClWA==",
      "dev": true,
      "devOptional": true,
      "license": "MIT",
      "dependencies": {
        "@jridgewell/set-array": "^1.2.1",
@@ -1003,7 +1004,7 @@
      "version": "3.1.2",
      "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz",
      "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==",
      "dev": true,
      "devOptional": true,
      "license": "MIT",
      "engines": {
        "node": ">=6.0.0"
@@ -1013,30 +1014,224 @@
      "version": "1.2.1",
      "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz",
      "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==",
      "dev": true,
      "devOptional": true,
      "license": "MIT",
      "engines": {
        "node": ">=6.0.0"
      }
    },
    "node_modules/@jridgewell/source-map": {
      "version": "0.3.6",
      "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz",
      "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==",
      "license": "MIT",
      "optional": true,
      "peer": true,
      "dependencies": {
        "@jridgewell/gen-mapping": "^0.3.5",
        "@jridgewell/trace-mapping": "^0.3.25"
      }
    },
    "node_modules/@jridgewell/sourcemap-codec": {
      "version": "1.5.0",
      "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.0.tgz",
      "integrity": "sha512-gv3ZRaISU3fjPAgNsriBRqGWQL6quFx04YMPW/zD8XMLsU32mhCCbfbO6KZFLjvYpCZ8zyDEgqsgf+PwPaM7GQ==",
      "dev": true,
      "devOptional": true,
      "license": "MIT"
    },
    "node_modules/@jridgewell/trace-mapping": {
      "version": "0.3.25",
      "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz",
      "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==",
      "dev": true,
      "devOptional": true,
      "license": "MIT",
      "dependencies": {
        "@jridgewell/resolve-uri": "^3.1.0",
        "@jridgewell/sourcemap-codec": "^1.4.14"
      }
    },
    "node_modules/@napi-rs/canvas": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.70.tgz",
      "integrity": "sha512-nD6NGa4JbNYSZYsTnLGrqe9Kn/lCkA4ybXt8sx5ojDqZjr2i0TWAHxx/vhgfjX+i3hCdKWufxYwi7CfXqtITSA==",
      "license": "MIT",
      "optional": true,
      "engines": {
        "node": ">= 10"
      },
      "optionalDependencies": {
        "@napi-rs/canvas-android-arm64": "0.1.70",
        "@napi-rs/canvas-darwin-arm64": "0.1.70",
        "@napi-rs/canvas-darwin-x64": "0.1.70",
        "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.70",
        "@napi-rs/canvas-linux-arm64-gnu": "0.1.70",
        "@napi-rs/canvas-linux-arm64-musl": "0.1.70",
        "@napi-rs/canvas-linux-riscv64-gnu": "0.1.70",
        "@napi-rs/canvas-linux-x64-gnu": "0.1.70",
        "@napi-rs/canvas-linux-x64-musl": "0.1.70",
        "@napi-rs/canvas-win32-x64-msvc": "0.1.70"
      }
    },
    "node_modules/@napi-rs/canvas-android-arm64": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.70.tgz",
      "integrity": "sha512-I/YOuQ0wbkVYxVaYtCgN42WKTYxNqFA0gTcTrHIGG1jfpDSyZWII/uHcjOo4nzd19io6Y4+/BqP8E5hJgf9OmQ==",
      "cpu": [
        "arm64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "android"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-darwin-arm64": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.70.tgz",
      "integrity": "sha512-4pPGyXetHIHkw2TOJHujt3mkCP8LdDu8+CT15ld9Id39c752RcI0amDHSuMLMQfAjvusA9B5kKxazwjMGjEJpQ==",
      "cpu": [
        "arm64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "darwin"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-darwin-x64": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.70.tgz",
      "integrity": "sha512-+2N6Os9LbkmDMHL+raknrUcLQhsXzc5CSXRbXws9C3pv/mjHRVszQ9dhFUUe9FjfPhCJznO6USVdwOtu7pOrzQ==",
      "cpu": [
        "x64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "darwin"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.70.tgz",
      "integrity": "sha512-QjscX9OaKq/990sVhSMj581xuqLgiaPVMjjYvWaCmAJRkNQ004QfoSMEm3FoTqM4DRoquP8jvuEXScVJsc1rqQ==",
      "cpu": [
        "arm"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-linux-arm64-gnu": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.70.tgz",
      "integrity": "sha512-LNakMOwwqwiHIwMpnMAbFRczQMQ7TkkMyATqFCOtUJNlE6LPP/QiUj/mlFrNbUn/hctqShJ60gWEb52ZTALbVw==",
      "cpu": [
        "arm64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-linux-arm64-musl": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.70.tgz",
      "integrity": "sha512-wBTOllEYNfJCHOdZj9v8gLzZ4oY3oyPX8MSRvaxPm/s7RfEXxCyZ8OhJ5xAyicsDdbE5YBZqdmaaeP5+xKxvtg==",
      "cpu": [
        "arm64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.70.tgz",
      "integrity": "sha512-GVUUPC8TuuFqHip0rxHkUqArQnlzmlXmTEBuXAWdgCv85zTCFH8nOHk/YCF5yo0Z2eOm8nOi90aWs0leJ4OE5Q==",
      "cpu": [
        "riscv64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-linux-x64-gnu": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.70.tgz",
      "integrity": "sha512-/kvUa2lZRwGNyfznSn5t1ShWJnr/m5acSlhTV3eXECafObjl0VBuA1HJw0QrilLpb4Fe0VLywkpD1NsMoVDROQ==",
      "cpu": [
        "x64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-linux-x64-musl": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.70.tgz",
      "integrity": "sha512-aqlv8MLpycoMKRmds7JWCfVwNf1fiZxaU7JwJs9/ExjTD8lX2KjsO7CTeAj5Cl4aEuzxUWbJPUUE2Qu9cZ1vfg==",
      "cpu": [
        "x64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@napi-rs/canvas-win32-x64-msvc": {
      "version": "0.1.70",
      "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.70.tgz",
      "integrity": "sha512-Q9QU3WIpwBTVHk4cPfBjGHGU4U0llQYRXgJtFtYqqGNEOKVN4OT6PQ+ve63xwIPODMpZ0HHyj/KLGc9CWc3EtQ==",
      "cpu": [
        "x64"
      ],
      "license": "MIT",
      "optional": true,
      "os": [
        "win32"
      ],
      "engines": {
        "node": ">= 10"
      }
    },
    "node_modules/@nodelib/fs.scandir": {
      "version": "2.1.5",
      "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz",
@@ -2002,7 +2197,7 @@
      "version": "8.14.0",
      "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.14.0.tgz",
      "integrity": "sha512-cl669nCJTZBsL97OF4kUQm5g5hC2uihk0NxY3WENAC0TYdILVkAyHymAntgxGkl7K+t0cXIrH5siy5S4XkFycA==",
      "dev": true,
      "devOptional": true,
      "license": "MIT",
      "bin": {
        "acorn": "bin/acorn"
@@ -2186,6 +2381,14 @@
      "devOptional": true,
      "license": "MIT/X11"
    },
    "node_modules/buffer-from": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz",
      "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
      "license": "MIT",
      "optional": true,
      "peer": true
    },
    "node_modules/callsites": {
      "version": "3.1.0",
      "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz",
@@ -4843,6 +5046,18 @@
        "node": ">=8"
      }
    },
    "node_modules/pdfjs-dist": {
      "version": "5.2.133",
      "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.2.133.tgz",
      "integrity": "sha512-abE6ZWDxztt+gGFzfm4bX2ggfxUk9wsDEoFzIJm9LozaY3JdXR7jyLK4Bjs+XLXplCduuWS1wGhPC4tgTn/kzg==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=20.16.0 || >=22.3.0"
      },
      "optionalDependencies": {
        "@napi-rs/canvas": "^0.1.67"
      }
    },
    "node_modules/picocolors": {
      "version": "1.1.1",
      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
@@ -5753,6 +5968,17 @@
        "node": ">=8"
      }
    },
    "node_modules/source-map": {
      "version": "0.6.1",
      "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
      "license": "BSD-3-Clause",
      "optional": true,
      "peer": true,
      "engines": {
        "node": ">=0.10.0"
      }
    },
    "node_modules/source-map-js": {
      "version": "1.2.1",
      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
@@ -5762,6 +5988,18 @@
        "node": ">=0.10.0"
      }
    },
    "node_modules/source-map-support": {
      "version": "0.5.21",
      "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz",
      "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==",
      "license": "MIT",
      "optional": true,
      "peer": true,
      "dependencies": {
        "buffer-from": "^1.0.0",
        "source-map": "^0.6.0"
      }
    },
    "node_modules/space-separated-tokens": {
      "version": "2.0.2",
      "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz",
@@ -5859,6 +6097,34 @@
        "node": ">=6"
      }
    },
    "node_modules/terser": {
      "version": "5.39.1",
      "resolved": "https://registry.npmjs.org/terser/-/terser-5.39.1.tgz",
      "integrity": "sha512-Mm6+uad0ZuDtcV8/4uOZQDQ8RuiC5Pu+iZRedJtF7yA/27sPL7d++In/AJKpWZlU3SYMPPkVfwetn6sgZ66pUA==",
      "license": "BSD-2-Clause",
      "optional": true,
      "peer": true,
      "dependencies": {
        "@jridgewell/source-map": "^0.3.3",
        "acorn": "^8.8.2",
        "commander": "^2.20.0",
        "source-map-support": "~0.5.20"
      },
      "bin": {
        "terser": "bin/terser"
      },
      "engines": {
        "node": ">=10"
      }
    },
    "node_modules/terser/node_modules/commander": {
      "version": "2.20.3",
      "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz",
      "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==",
      "license": "MIT",
      "optional": true,
      "peer": true
    },
    "node_modules/textlinestream": {
      "version": "1.1.1",
      "resolved": "https://registry.npmjs.org/textlinestream/-/textlinestream-1.1.1.tgz",

@@ -21,6 +21,7 @@
    "dexie": "^4.0.11",
    "highlight.js": "^11.10.0",
    "katex": "^0.16.15",
    "pdfjs-dist": "^5.2.133",
    "postcss": "^8.4.49",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",

@@ -28,13 +28,13 @@ function AppLayout() {
  return (
    <>
      <Sidebar />
      <div
      <main
        className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto bg-base-100"
        id="main-scroll"
      >
        <Header />
        <Outlet />
      </div>
      </main>
      {
        <SettingDialog
          show={showSettings}

@@ -16,6 +16,8 @@ export const CONFIG_DEFAULT = {
  showTokensPerSecond: false,
  showThoughtInProgress: false,
  excludeThoughtOnReq: true,
  pasteLongTextToFileLen: 2500,
  pdfAsImage: false,
  // make sure these default values are in sync with `common.h`
  samplers: 'edkypmxt',
  temperature: 0.8,
@@ -43,6 +45,8 @@ export const CONFIG_DEFAULT = {
export const CONFIG_INFO: Record<string, string> = {
  apiKey: 'Set the API Key if you are using --api-key option for the server.',
  systemMessage: 'The starting message that defines how model should behave.',
  pasteLongTextToFileLen:
    'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
  samplers:
    'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
  temperature:

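The two new settings drive the paste-to-file behavior used later in `ChatInput`; restated as a predicate (a sketch only, the real check lives inline in `onPasteCapture`):

```ts
function shouldConvertPasteToFile(text: string, pasteLongTextToFileLen: number): boolean {
  return (
    text.length > 0 &&
    pasteLongTextToFileLen > 0 &&        // 0 disables the conversion
    text.length > pasteLongTextToFileLen // default threshold: 2500 chars
  );
}
```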
@@ -18,16 +18,26 @@ export default function ChatInputExtraContextItem({
  if (!items) return null;

  return (
    <div className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1">
    <div
      className="flex flex-row gap-4 overflow-x-auto py-2 px-1 mb-1"
      role="group"
      aria-description="Selected files"
    >
      {items.map((item, i) => (
        <div
          className="indicator"
          key={i}
          onClick={() => clickToShow && setShow(i)}
          tabIndex={0}
          aria-description={
            clickToShow ? `Click to show: ${item.name}` : undefined
          }
          role={clickToShow ? 'button' : 'menuitem'}
        >
          {removeItem && (
            <div className="indicator-item indicator-top">
              <button
                aria-label="Remove file"
                className="btn btn-neutral btn-sm w-4 h-4 p-0 rounded-full"
                onClick={() => removeItem(i)}
              >
@@ -46,13 +56,16 @@ export default function ChatInputExtraContextItem({
            <>
              <img
                src={item.base64Url}
                alt={item.name}
                alt={`Preview image for ${item.name}`}
                className="w-14 h-14 object-cover rounded-md"
              />
            </>
          ) : (
            <>
              <div className="w-14 h-14 flex items-center justify-center">
              <div
                className="w-14 h-14 flex items-center justify-center"
                aria-description="Document icon"
              >
                <DocumentTextIcon className="h-8 w-14 text-base-content/50" />
              </div>

@@ -66,16 +79,25 @@ export default function ChatInputExtraContextItem({
      ))}

      {showingItem && (
        <dialog className="modal modal-open">
        <dialog
          className="modal modal-open"
          aria-description={`Preview ${showingItem.name}`}
        >
          <div className="modal-box">
            <div className="flex justify-between items-center mb-4">
              <b>{showingItem.name ?? 'Extra content'}</b>
              <button className="btn btn-ghost btn-sm">
              <button
                className="btn btn-ghost btn-sm"
                aria-label="Close preview dialog"
              >
                <XMarkIcon className="h-5 w-5" onClick={() => setShow(-1)} />
              </button>
            </div>
            {showingItem.type === 'imageFile' ? (
              <img src={showingItem.base64Url} alt={showingItem.name} />
              <img
                src={showingItem.base64Url}
                alt={`Preview image for ${showingItem.name}`}
              />
            ) : (
              <div className="overflow-x-auto">
                <pre className="whitespace-pre-wrap break-words text-sm">

@@ -83,13 +83,20 @@ export default function ChatMessage({

  if (!viewingChat) return null;

  const isUser = msg.role === 'user';

  return (
    <div className="group" id={id}>
    <div
      className="group"
      id={id}
      role="group"
      aria-description={`Message from ${msg.role}`}
    >
      <div
        className={classNames({
          chat: true,
          'chat-start': msg.role !== 'user',
          'chat-end': msg.role === 'user',
          'chat-start': !isUser,
          'chat-end': isUser,
        })}
      >
        {msg.extra && msg.extra.length > 0 && (
@@ -99,7 +106,7 @@ export default function ChatMessage({
        <div
          className={classNames({
            'chat-bubble markdown': true,
            'chat-bubble bg-transparent': msg.role !== 'user',
            'chat-bubble bg-transparent': !isUser,
          })}
        >
          {/* textarea for editing message */}
@@ -142,7 +149,7 @@ export default function ChatMessage({
          ) : (
            <>
              {/* render message as markdown */}
              <div dir="auto">
              <div dir="auto" tabIndex={0}>
                {thought && (
                  <ThoughtProcess
                    isThinking={!!isThinking && !!isPending}
@@ -196,13 +203,18 @@ export default function ChatMessage({
          })}
        >
          {siblingLeafNodeIds && siblingLeafNodeIds.length > 1 && (
            <div className="flex gap-1 items-center opacity-60 text-sm">
            <div
              className="flex gap-1 items-center opacity-60 text-sm"
              role="navigation"
              aria-description={`Message version ${siblingCurrIdx + 1} of ${siblingLeafNodeIds.length}`}
            >
              <button
                className={classNames({
                  'btn btn-sm btn-ghost p-1': true,
                  'opacity-20': !prevSibling,
                })}
                onClick={() => prevSibling && onChangeSibling(prevSibling)}
                aria-label="Previous message version"
              >
                <ChevronLeftIcon className="h-4 w-4" />
              </button>
@@ -215,6 +227,7 @@ export default function ChatMessage({
                  'opacity-20': !nextSibling,
                })}
                onClick={() => nextSibling && onChangeSibling(nextSibling)}
                aria-label="Next message version"
              >
                <ChevronRightIcon className="h-4 w-4" />
              </button>
@@ -223,7 +236,7 @@ export default function ChatMessage({
          {/* user message */}
          {msg.role === 'user' && (
            <BtnWithTooltips
              className="btn-mini show-on-hover w-8 h-8"
              className="btn-mini w-8 h-8"
              onClick={() => setEditingContent(msg.content)}
              disabled={msg.content === null}
              tooltipsContent="Edit message"
@@ -236,7 +249,7 @@ export default function ChatMessage({
            <>
              {!isPending && (
                <BtnWithTooltips
                  className="btn-mini show-on-hover w-8 h-8"
                  className="btn-mini w-8 h-8"
                  onClick={() => {
                    if (msg.content !== null) {
                      onRegenerateMessage(msg as Message);
@@ -250,10 +263,7 @@ export default function ChatMessage({
              )}
            </>
          )}
          <CopyButton
            className="btn-mini show-on-hover w-8 h-8"
            content={msg.content}
          />
          <CopyButton className="btn-mini w-8 h-8" content={msg.content} />
        </div>
      )}
    </div>
@@ -271,6 +281,8 @@ function ThoughtProcess({
}) {
  return (
    <div
      role="button"
      aria-label="Toggle thought process display"
      tabIndex={0}
      className={classNames({
        'collapse bg-none': true,
@@ -292,7 +304,11 @@ function ThoughtProcess({
          )}
        </div>
      </div>
      <div className="collapse-content text-base-content/70 text-sm p-1">
      <div
        className="collapse-content text-base-content/70 text-sm p-1"
        tabIndex={0}
        aria-description="Thought process content"
      >
        <div className="border-l-2 border-base-content/20 pl-4 mb-4">
          <MarkdownDisplay content={content} />
        </div>

@@ -279,7 +279,11 @@ export default function ChatScreen() {
function ServerInfo() {
  const { serverProps } = useAppContext();
  return (
    <div className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6">
    <div
      className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
      tabIndex={0}
      aria-description="Server information"
    >
      <div className="card-body">
        <b>Server Info</b>
        <p>
@@ -306,10 +310,13 @@ function ChatInput({
  onStop: () => void;
  isGenerating: boolean;
}) {
  const { config } = useAppContext();
  const [isDrag, setIsDrag] = useState(false);

  return (
    <div
      role="group"
      aria-label="Chat input"
      className={classNames({
        'flex items-end pt-8 pb-6 sticky bottom-0 bg-base-100': true,
        'opacity-50': isDrag, // simply visual feedback to inform user that the file will be accepted
@@ -328,7 +335,28 @@ function ChatInput({
      {({ getRootProps, getInputProps }) => (
        <div
          className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
          // when a file is pasted to the input, we handle it here
          // if a text is pasted, and if it is long text, we will convert it to a file
          onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
            const text = e.clipboardData.getData('text/plain');
            if (
              text.length > 0 &&
              config.pasteLongTextToFileLen > 0 &&
              text.length > config.pasteLongTextToFileLen
            ) {
              // if the text is too long, we will convert it to a file
              extraContext.addItems([
                {
                  type: 'context',
                  name: 'Pasted Content',
                  content: text,
                },
              ]);
              e.preventDefault();
              return;
            }

            // if a file is pasted, we will handle it here
            const files = Array.from(e.clipboardData.items)
              .filter((item) => item.kind === 'file')
              .map((item) => item.getAsFile())
@@ -378,13 +406,15 @@ function ChatInput({
              'btn w-8 h-8 p-0 rounded-full': true,
              'btn-disabled': isGenerating,
            })}
            aria-label="Upload file"
            tabIndex={0}
            role="button"
          >
            <PaperClipIcon className="h-5 w-5" />
          </label>
          <input
            id="file-upload"
            type="file"
            className="hidden"
            disabled={isGenerating}
            {...getInputProps()}
            hidden
@@ -400,6 +430,7 @@ function ChatInput({
            <button
              className="btn btn-primary w-8 h-8 p-0 rounded-full"
              onClick={onSend}
              aria-label="Send message"
            >
              <ArrowUpIcon className="h-5 w-5" />
            </button>

@@ -38,8 +38,12 @@ export default function Header() {

      {/* action buttons (top right) */}
      <div className="flex items-center">
        <div className="tooltip tooltip-bottom" data-tip="Settings">
          <button className="btn" onClick={() => setShowSettings(true)}>
        <div
          className="tooltip tooltip-bottom"
          data-tip="Settings"
          onClick={() => setShowSettings(true)}
        >
          <button className="btn" aria-hidden={true}>
            {/* settings button */}
            <Cog8ToothIcon className="w-5 h-5" />
          </button>

@@ -100,6 +100,16 @@ const SETTING_SECTIONS: SettingSection[] = [
            key,
          }) as SettingFieldInput
      ),
      {
        type: SettingInputType.SHORT_INPUT,
        label: 'Paste length to file',
        key: 'pasteLongTextToFileLen',
      },
      {
        type: SettingInputType.CHECKBOX,
        label: 'Parse PDF as image instead of text',
        key: 'pdfAsImage',
      },
    ],
  },
  {
@@ -325,14 +335,22 @@ export default function SettingDialog({
  };

  return (
    <dialog className={classNames({ modal: true, 'modal-open': show })}>
    <dialog
      className={classNames({ modal: true, 'modal-open': show })}
      aria-label="Settings dialog"
    >
      <div className="modal-box w-11/12 max-w-3xl">
        <h3 className="text-lg font-bold mb-6">Settings</h3>
        <div className="flex flex-col md:flex-row h-[calc(90vh-12rem)]">
          {/* Left panel, showing sections - Desktop version */}
          <div className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200">
          <div
            className="hidden md:flex flex-col items-stretch pr-4 mr-4 border-r-2 border-base-200"
            role="complementary"
            aria-description="Settings sections"
            tabIndex={0}
          >
            {SETTING_SECTIONS.map((section, idx) => (
              <div
              <button
                key={idx}
                className={classNames({
                  'btn btn-ghost justify-start font-normal w-44 mb-1': true,
@@ -342,12 +360,16 @@ export default function SettingDialog({
                dir="auto"
              >
                {section.title}
              </div>
              </button>
            ))}
          </div>

          {/* Left panel, showing sections - Mobile version */}
          <div className="md:hidden flex flex-row gap-2 mb-4">
          {/* This menu is skipped on a11y, otherwise it's repeated the desktop version */}
          <div
            className="md:hidden flex flex-row gap-2 mb-4"
            aria-disabled={true}
          >
            <details className="dropdown">
              <summary className="btn bt-sm w-full m-1">
                {SETTING_SECTIONS[sectionIdx].title}
@@ -452,10 +474,10 @@ function SettingsModalLongInput({
  label?: string;
}) {
  return (
    <label className="form-control mb-2">
      <div className="label inline">{label || configKey}</div>
    <label className="form-control">
      <div className="label inline text-sm">{label || configKey}</div>
      <textarea
        className="textarea textarea-bordered h-24"
        className="textarea textarea-bordered h-24 mb-2"
        placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
        value={value}
        onChange={(e) => onChange(e.target.value)}
@@ -482,9 +504,7 @@ function SettingsModalShortInput({
    <>
      {/* on mobile, we simply show the help message here */}
      {helpMsg && (
        <div className="block md:hidden mb-1">
          <b>{label || configKey}</b>
          <br />
        <div className="block mb-1 opacity-75">
          <p className="text-xs">{helpMsg}</p>
        </div>
      )}
@@ -493,11 +513,6 @@ function SettingsModalShortInput({
        <div tabIndex={0} role="button" className="font-bold hidden md:block">
          {label || configKey}
        </div>
        {helpMsg && (
          <div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
            {helpMsg}
          </div>
        )}
      </div>
      <input
        type="text"

@@ -50,44 +50,72 @@ export default function Sidebar() {
        id="toggle-drawer"
        type="checkbox"
        className="drawer-toggle"
        aria-label="Toggle sidebar"
        defaultChecked
      />

      <div className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64">
      <div
        className="drawer-side h-screen lg:h-screen z-50 lg:max-w-64"
        role="complementary"
        aria-label="Sidebar"
        tabIndex={0}
      >
        <label
          htmlFor="toggle-drawer"
          aria-label="close sidebar"
          aria-label="Close sidebar"
          className="drawer-overlay"
        ></label>

        <a
          href="#main-scroll"
          className="absolute -left-80 top-0 w-1 h-1 overflow-hidden"
        >
          Skip to main content
        </a>

        <div className="flex flex-col bg-base-200 min-h-full max-w-64 py-4 px-4">
          <div className="flex flex-row items-center justify-between mb-4 mt-4">
            <h2 className="font-bold ml-4">Conversations</h2>
            <h2 className="font-bold ml-4" role="heading">
              Conversations
            </h2>

            {/* close sidebar button */}
            <label htmlFor="toggle-drawer" className="btn btn-ghost lg:hidden">
            <label
              htmlFor="toggle-drawer"
              className="btn btn-ghost lg:hidden"
              aria-label="Close sidebar"
              role="button"
              tabIndex={0}
            >
              <XMarkIcon className="w-5 h-5" />
            </label>
          </div>

          {/* new conversation button */}
          <div
          <button
            className={classNames({
              'btn btn-ghost justify-start px-2': true,
              'btn-soft': !currConv,
            })}
            onClick={() => navigate('/')}
            aria-label="New conversation"
          >
            <PencilSquareIcon className="w-5 h-5" />
            New conversation
          </div>
          </button>

          {/* list of conversations */}
          {groupedConv.map((group, i) => (
            <div key={i}>
            <div key={i} role="group">
              {/* group name (by date) */}
              {group.title ? (
                // we use btn class here to make sure that the padding/margin are aligned with the other items
                <b className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold">
                <b
                  className="btn btn-ghost btn-xs bg-none btn-disabled block text-xs text-base-content text-start px-2 mb-0 mt-6 font-bold"
                  role="note"
                  aria-description={group.title}
                  tabIndex={0}
                >
                  {group.title}
                </b>
              ) : (
@@ -184,20 +212,23 @@ function ConversationItem({
}) {
  return (
    <div
      role="menuitem"
      tabIndex={0}
      aria-label={conv.name}
      className={classNames({
        'group flex flex-row btn btn-ghost justify-start items-center font-normal px-2 h-9':
          true,
        'btn-soft': isCurrConv,
      })}
    >
      <div
      <button
        key={conv.id}
        className="w-full overflow-hidden truncate text-start"
        onClick={onSelect}
        dir="auto"
      >
        {conv.name}
      </div>
      </button>
      <div className="dropdown dropdown-end h-5">
        <BtnWithTooltips
          // on mobile, we always show the ellipsis icon
@@ -211,22 +242,23 @@ function ConversationItem({
        </BtnWithTooltips>
        {/* dropdown menu */}
        <ul
          aria-label="More options"
          tabIndex={0}
          className="dropdown-content menu bg-base-100 rounded-box z-[1] p-2 shadow"
        >
          <li onClick={onRename}>
          <li onClick={onRename} tabIndex={0}>
            <a>
              <PencilIcon className="w-4 h-4" />
              Rename
            </a>
          </li>
          <li onClick={onDownload}>
          <li onClick={onDownload} tabIndex={0}>
            <a>
              <ArrowDownTrayIcon className="w-4 h-4" />
              Download
            </a>
          </li>
          <li className="text-error" onClick={onDelete}>
          <li className="text-error" onClick={onDelete} tabIndex={0}>
            <a>
              <TrashIcon className="w-4 h-4" />
              Delete

@@ -2,6 +2,17 @@ import { useState } from 'react';
import { MessageExtra } from '../utils/types';
import toast from 'react-hot-toast';
import { useAppContext } from '../utils/app.context';
import * as pdfjs from 'pdfjs-dist';
import pdfjsWorkerSrc from 'pdfjs-dist/build/pdf.worker.min.mjs?url';
import { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';

pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;

// This file handles uploading extra context items (a.k.a files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - text files (including code files)
// - pdf (converted to text)

// Interface describing the API returned by the hook
export interface ChatExtraContextApi {
@@ -13,7 +24,7 @@ export interface ChatExtraContextApi {
}

export function useChatExtraContext(): ChatExtraContextApi {
  const { serverProps } = useAppContext();
  const { serverProps, config } = useAppContext();
  const [items, setItems] = useState<MessageExtra[]>([]);

  const addItems = (newItems: MessageExtra[]) => {
@@ -28,6 +39,8 @@ export function useChatExtraContext(): ChatExtraContextApi {
    setItems([]);
  };

  const isSupportVision = serverProps?.modalities?.vision;

  const onFileAdded = (files: File[]) => {
    for (const file of files) {
      const mimeType = file.type;
@@ -38,7 +51,7 @@ export function useChatExtraContext(): ChatExtraContextApi {
      }

      if (mimeType.startsWith('image/')) {
        if (!serverProps?.modalities?.vision) {
        if (!isSupportVision) {
          toast.error('Multimodal is not supported by this server or model.');
          break;
        }
@@ -69,7 +82,43 @@ export function useChatExtraContext(): ChatExtraContextApi {
        toast.error('Video and audio files are not supported yet.');
        break;
      } else if (mimeType.startsWith('application/pdf')) {
        toast.error('PDF files are not supported yet.');
        if (config.pdfAsImage && !isSupportVision) {
          toast(
            'Multimodal is not supported, PDF will be converted to text instead of image.'
          );
          break;
        }

        const promise =
          config.pdfAsImage && isSupportVision
            ? convertPDFToImage(file).then((base64Urls) => {
                addItems(
                  base64Urls.map((base64Url) => ({
                    type: 'imageFile',
                    name: file.name,
                    base64Url,
                  }))
                );
              })
            : convertPDFToText(file).then((content) => {
                if (isSupportVision) {
                  toast.success(
                    'PDF file converted to text. You can also convert it to image, see in Settings.'
                  );
                }
                addItems([
                  {
                    type: 'textFile',
                    name: file.name,
                    content,
                  },
                ]);
              });

        promise.catch((error) => {
          console.error(error);
          toast.error('Failed to parse PDF file.');
        });
        break;
      } else {
        // Because there can be many text file types (like code file), we will not check the mime type
@@ -105,11 +154,69 @@ export function useChatExtraContext(): ChatExtraContextApi {
  };
}

async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (event) => {
      if (event.target?.result) {
        resolve(event.target.result as ArrayBuffer);
      } else {
        reject(new Error('Failed to read file.'));
      }
    };
    reader.readAsArrayBuffer(file);
  });
}

async function convertPDFToText(file: File): Promise<string> {
  const buffer = await getFileAsBuffer(file);
  const pdf = await pdfjs.getDocument(buffer).promise;
  const numPages = pdf.numPages;
  const textContentPromises: Promise<TextContent>[] = [];
  for (let i = 1; i <= numPages; i++) {
    textContentPromises.push(
      pdf.getPage(i).then((page) => page.getTextContent())
    );
  }
  const textContents = await Promise.all(textContentPromises);
  const textItems = textContents.flatMap((textContent: TextContent) =>
    textContent.items.map((item) => (item as TextItem).str ?? '')
  );
  return textItems.join('\n');
}

// returns list of base64 images
async function convertPDFToImage(file: File): Promise<string[]> {
  const buffer = await getFileAsBuffer(file);
  const doc = await pdfjs.getDocument(buffer).promise;
  const pages: Promise<string>[] = [];

  for (let i = 1; i <= doc.numPages; i++) {
    const page = await doc.getPage(i);
    const viewport = page.getViewport({ scale: 1.5 });
    const canvas = document.createElement('canvas');
    const ctx = canvas.getContext('2d');
    canvas.width = viewport.width;
    canvas.height = viewport.height;
    if (!ctx) {
      throw new Error('Failed to get 2D context from canvas');
    }
    const task = page.render({ canvasContext: ctx, viewport: viewport });
    pages.push(
      task.promise.then(() => {
        return canvas.toDataURL();
      })
    );
  }

  return await Promise.all(pages);
}

// WARN: vibe code below
// This code is a heuristic to determine if a string is likely not binary.
// It is necessary because input file can have various mime types which we don't have time to investigate.
// For example, a python file can be text/plain, application/x-python, etc.
export function isLikelyNotBinary(str: string): boolean {
function isLikelyNotBinary(str: string): boolean {
  const options = {
    prefixLength: 1024 * 10, // Check the first 10KB of the string
    suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars

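The diff is cut off after the options object of `isLikelyNotBinary`. A plausible completion of the heuristic, assuming only the two options shown; the exact character classes counted as "suspicious" in the real implementation may differ:

```ts
function isLikelyNotBinarySketch(str: string): boolean {
  const prefixLength = 1024 * 10;            // check the first 10KB of the string
  const suspiciousCharThresholdRatio = 0.15; // allow up to 15% suspicious chars
  const sample = str.slice(0, prefixLength);
  if (sample.length === 0) return true;
  let total = 0;
  let suspicious = 0;
  for (const ch of sample) {
    total++;
    const code = ch.codePointAt(0) ?? 0;
    // count replacement chars and control chars other than \t, \n, \r
    if (code === 0xfffd || (code < 32 && code !== 9 && code !== 10 && code !== 13)) {
      suspicious++;
    }
  }
  return suspicious / total <= suspiciousCharThresholdRatio;
}
```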
@@ -34,9 +34,6 @@ html {
  /* TODO: fix markdown table */
}

.show-on-hover {
  @apply md:opacity-0 md:group-hover:opacity-100;
}
.btn-mini {
  @apply cursor-pointer;
}

@@ -52,13 +52,20 @@ export function BtnWithTooltips({
  tooltipsContent: string;
  disabled?: boolean;
}) {
  // the onClick handler is on the container, so screen readers can safely ignore the inner button
  // this prevents the label from being read twice
  return (
    <div className="tooltip tooltip-bottom" data-tip={tooltipsContent}>
    <div
      className="tooltip tooltip-bottom"
      data-tip={tooltipsContent}
      role="button"
      onClick={onClick}
    >
      <button
        className={`${className ?? ''} flex items-center justify-center`}
        onClick={onClick}
        disabled={disabled}
        onMouseLeave={onMouseLeave}
        aria-hidden={true}
      >
        {children}
      </button>

@@ -7,7 +7,7 @@ import * as fflate from 'fflate';

/* eslint-disable */

const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary
const MAX_BUNDLE_SIZE = 2 * 1024 * 1024; // only increase when absolutely necessary

const GUIDE_FOR_FRONTEND = `
<!--
