Compare commits

...

15 Commits

Author SHA1 Message Date
Xuan Son Nguyen
8654805027 docker : publish to both ggerganov and ggml-org 2025-02-15 15:18:04 +01:00
Woof Dog
31afcbee0e server : (webui) Give copy button back to all message bubbles (#11814)
* All messages get the copy button

* Update index.html.gz
2025-02-12 23:47:11 +01:00
uvos
5c4284d57b HIP: Remove GCN from list of devices that avoid MMQ (#11831) 2025-02-12 22:25:28 +01:00
JC
bfd11a2344 Fix: Compile failure due to Microsoft STL breaking change (#11836) 2025-02-12 21:36:11 +01:00
Georgi Gerganov
0fb77f821f sync : ggml 2025-02-12 21:46:02 +02:00
uvos
e598697d63 HIP: Switch to std::vector in rocblas version check (#11820) 2025-02-12 17:25:03 +01:00
bandoti
fef0cbeadf cleanup: fix compile warnings associated with gnu_printf (#11811) 2025-02-12 10:06:53 -04:00
Richard
748ee9fe93 ggml : fix multi-threaded clamp_f32 (#11824)
* Bug fix for clamp_f32

When using tensors larger than 1D, the clamp operation does not work because of the early return when ith is not 0.

* Bug fix for clamp_f32

* Bug fix for clamp_f32
2025-02-12 15:57:33 +02:00
Weizhao Ouyang
198b1ec611 ggml-cpu: Fix duplicate MATMUL_INT8 (#11817)
Signed-off-by: Weizhao Ouyang <o451686892@gmail.com>
2025-02-12 13:22:58 +01:00
Johannes Gäßler
c3d6af7cd2 CUDA: fix CUDART_VERSION checks (#11821) 2025-02-12 13:16:39 +01:00
Daniel Bevenius
369be5598a llama : fix typo in llama-grammar.h [no ci] (#11816) 2025-02-12 09:40:01 +02:00
lhez
4078c77f98 docs: add OpenCL (#11697) 2025-02-11 15:04:13 -07:00
Sheldon Robinson
90e4dba461 Fix #11802: Compile bug - RegQueryValueExA changed to RegQueryValueEx (#11803)
* Fix #11802: Compile bug - RegQueryValueExA changed to RegQueryValueEx

* Fix #11802: PR #11803 - keep RegQueryValueExA, remove TEXT macro, description needs to be ANSI string
2025-02-11 16:55:45 +01:00
Daniel Bevenius
a18f481f99 server : use common_token_to_piece instead of common_detokenize (#11740)
* server : use common_token_to_piece instead of common_detokenize

This commit replaces the call to common_detokenize with
common_token_to_piece in populate_token_probs.

The motivation for this change is to avoid an issue where
common_detokenize would remove the word-boundary character for tokens,
which caused a regression in the server-generated token probabilities.

Resolves: https://github.com/ggerganov/llama.cpp/issues/11728

* squash! server : use common_token_to_piece instead of common_detokenize

Use common_token_to_piece for post_sampling_probs as well.
2025-02-11 14:06:45 +01:00
Johannes Gäßler
b9ab0a4d0b CUDA: use arch list for compatibility check (#11775)
* CUDA: use arch list for feature availability check

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-02-11 00:17:22 +01:00
25 changed files with 350 additions and 74 deletions

View File

@@ -78,16 +78,32 @@ jobs:
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
fi
# list all tags possible
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
TYPE=""
TYPE=""
else
TYPE="-${{ matrix.config.tag }}"
TYPE="-${{ matrix.config.tag }}"
fi
# also publish to legacy, for smooth transition to ggml-org
if [[ "${REPO_OWNER}/${REPO_NAME}" == "ggml-org/llama.cpp" ]]; then
LEGACY_PREFIX="ghcr.io/ggerganov/llama.cpp:"
LEGACY_FULLTAGS=",${LEGACY_PREFIX}full${TYPE},${LEGACY_PREFIX}full${TYPE}${TAG_POSTFIX}"
LEGACY_LIGHTTAGS=",${LEGACY_PREFIX}light${TYPE},${LEGACY_PREFIX}light${TYPE}${TAG_POSTFIX}"
LEGACY_SERVERTAGS=",${LEGACY_PREFIX}server${TYPE},${LEGACY_PREFIX}server${TYPE}${TAG_POSTFIX}"
else
LEGACY_PREFIX=""
LEGACY_FULLTAGS=""
LEGACY_LIGHTTAGS=""
LEGACY_SERVERTAGS=""
fi
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}${LEGACY_FULLTAGS}"
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}${LEGACY_LIGHTTAGS}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}${LEGACY_SERVERTAGS}"
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT

View File

@@ -235,6 +235,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [HIP](docs/build.md#hip) | AMD GPU |
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
## Building the project

View File

@@ -424,13 +424,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
//
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# if defined(__MINGW32__) && !defined(__clang__)
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)

View File

@@ -1,5 +1,6 @@
#include "log.h"
#include <chrono>
#include <condition_variable>
#include <cstdarg>
#include <cstdio>

View File

@@ -15,7 +15,7 @@
#ifndef __GNUC__
# define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
#elif defined(__MINGW32__) && !defined(__clang__)
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))

docs/backend/OPENCL.md (new file, +205 lines)
View File

@@ -0,0 +1,205 @@
# llama.cpp for OpenCL
- [Background](#background)
- [OS](#os)
- [Hardware](#hardware)
- [DataType Supports](#datatype-supports)
- [Model Preparation](#model-preparation)
- [CMake Options](#cmake-options)
- [Android](#android)
- [Windows 11 Arm64](#windows-11-arm64)
- [Known Issues](#known-issues)
- [TODO](#todo)
## Background
OpenCL (Open Computing Language) is an open, royalty-free standard for cross-platform, parallel programming of diverse accelerators found in supercomputers, cloud servers, personal computers, mobile devices and embedded platforms. OpenCL specifies a programming language (based on C99) for programming these devices and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. Similar to CUDA, OpenCL has been widely used to program GPUs and is supported by most GPU vendors.
### Llama.cpp + OpenCL
The llama.cpp OpenCL backend is designed first and foremost to enable llama.cpp on **Qualcomm Adreno GPUs** via OpenCL. Thanks to the portability of OpenCL, it can also run on certain Intel GPUs, although the performance is not optimal.
## OS
| OS | Status | Verified |
|---------|---------|------------------------------------------------|
| Android | Support | Snapdragon 8 Gen 3, Snapdragon 8 Elite |
| Windows | Support | Windows 11 Arm64 with Snapdragon X Elite |
| Linux | Support | Ubuntu 22.04 WSL2 with Intel 12700H |
## Hardware
### Adreno GPU
**Verified devices**
| Adreno GPU | Status |
|:------------------------------------:|:-------:|
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
| Adreno 830 (Snapdragon 8 Elite) | Support |
| Adreno X85 (Snapdragon X Elite) | Support |
## DataType Supports
| DataType | Status |
|:----------------------:|:--------------------------:|
| Q4_0 | Support |
| Q6_K | Support, but not optimized |
## Model Preparation
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation.
Currently we support `Q4_0` quantization and have optimized for it. To achieve the best performance on Adreno GPUs, add `--pure` to `llama-quantize`. For example,
```sh
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
```
Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse than with pure `Q4_0` quantization.
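As a quick sanity check after quantization, the model can be run with the standard llama.cpp CLI on a machine where the backend is available (the flags below are the usual llama.cpp ones; `-ngl 99` offloads all layers to the GPU):
```sh
./llama-cli -m ggml-model-qwen-3b-Q4_0.gguf -ngl 99 -p "Hello, my name is"
```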
## CMake Options
The OpenCL backend has the following CMake options that control its behavior.
| CMake options | Default value | Description |
|:---------------------------------:|:--------------:|:------------------------------------------|
| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embed OpenCL kernels into the executable. |
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Use kernels optimized for Adreno. |
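For example, a configure line that enables the backend and spells out both options explicitly might look like this (illustrative only; the platform-specific invocations below show the full toolchain setup):
```sh
cmake .. -G Ninja \
  -DGGML_OPENCL=ON \
  -DGGML_OPENCL_EMBED_KERNELS=ON \
  -DGGML_OPENCL_USE_ADRENO_KERNELS=ON
```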
## Android
Ubuntu 22.04 is used for targeting Android. Make sure the following tools are accessible from the command line:
* Git
* CMake 3.29
* Ninja
* Python3
### I. Setup Environment
1. **Install NDK**
```sh
cd ~
wget https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && \
unzip commandlinetools-linux-8512546_latest.zip && \
mkdir -p ~/android-sdk/cmdline-tools && \
mv cmdline-tools latest && \
mv latest ~/android-sdk/cmdline-tools/ && \
rm -rf commandlinetools-linux-8512546_latest.zip
yes | ~/android-sdk/cmdline-tools/latest/bin/sdkmanager "ndk;26.3.11579264"
```
2. **Install OpenCL Headers and Library**
```sh
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && \
cd OpenCL-Headers && \
cp -r CL ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
cd OpenCL-ICD-Loader && \
mkdir build_ndk26 && cd build_ndk26 && \
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
-DOPENCL_ICD_LOADER_HEADERS_DIR=$HOME/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=24 \
-DANDROID_STL=c++_shared && \
ninja && \
cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
```
### II. Build llama.cpp
```sh
cd ~/dev/llm
git clone https://github.com/ggerganov/llama.cpp && \
cd llama.cpp && \
mkdir build-android && cd build-android
cmake .. -G Ninja \
-DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DBUILD_SHARED_LIBS=OFF \
-DGGML_OPENCL=ON
ninja
```
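To try the build on a device, one option is to push the binary and a quantized model over `adb` and run from the device shell (paths and model name below are examples, not from this guide):
```sh
adb push build-android/bin/llama-cli /data/local/tmp/
adb push ggml-model-qwen-3b-Q4_0.gguf /data/local/tmp/
adb shell 'cd /data/local/tmp && ./llama-cli -m ggml-model-qwen-3b-Q4_0.gguf -ngl 99 -p "Hello"'
```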
## Windows 11 Arm64
A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the following tools are accessible from the command line:
* Git
* CMake 3.29
* Clang 19
* Ninja
* Visual Studio 2022
PowerShell is used for the following instructions.
### I. Setup Environment
1. **Install OpenCL Headers and Library**
```powershell
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
mkdir build && cd build
cmake .. -G Ninja `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
cd ~/dev/llm
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
mkdir build && cd build
cmake .. -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
cmake --build . --target install
```
### II. Build llama.cpp
```powershell
mkdir -p ~/dev/llm
cd ~/dev/llm
git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp
mkdir build && cd build
cmake .. -G Ninja `
-DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
-DBUILD_SHARED_LIBS=OFF `
-DGGML_OPENCL=ON
ninja
```
## Known Issues
- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
## TODO
- Fix Qwen2.5 0.5B
- Optimization for Q6_K
- Support and optimization for Q4_K

View File

@@ -3,6 +3,7 @@
#include "log.h"
#include "llama.h"
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>

View File

@@ -3,6 +3,7 @@
#include "log.h"
#include "llama.h"
#include <chrono>
#include <algorithm>
#include <array>
#include <atomic>

Binary file not shown.

View File

@@ -2279,7 +2279,7 @@ struct server_context {
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
result.probs.push_back({
cur_p->data[i].id,
common_detokenize(ctx, {cur_p->data[i].id}, special),
common_token_to_piece(ctx, cur_p->data[i].id, special),
cur_p->data[i].p
});
}
@@ -2301,7 +2301,7 @@ struct server_context {
for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
result.probs.push_back({
cur[i].id,
common_detokenize(ctx, {cur[i].id}, special),
common_token_to_piece(ctx, cur[i].id, special),
cur[i].p
});
}
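A minimal sketch of the distinction behind this change (assuming the `common_*` helpers from llama.cpp's common library; not verbatim from the PR): `common_token_to_piece` returns the raw piece for a single token, including its leading word-boundary space, while detokenizing a one-token vector can post-process that boundary away.

```cpp
// Minimal sketch (assumes llama.cpp's common helpers; not the exact server code).
#include "common.h"
#include "llama.h"
#include <string>

// What a probability entry should display for one candidate token:
// common_token_to_piece keeps the raw piece, e.g. " world" with its leading space,
// while common_detokenize(ctx, {tok}, special) may strip that word boundary,
// which is the regression described in the corresponding commit message above.
static std::string probs_piece(llama_context * ctx, llama_token tok, bool special) {
    return common_token_to_piece(ctx, tok, special);
}
```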

View File

@@ -254,12 +254,12 @@ export default function ChatMessage({
🔄 Regenerate
</button>
)}
<CopyButton
className="badge btn-mini show-on-hover mr-2"
content={msg.content}
/>
</>
)}
<CopyButton
className="badge btn-mini show-on-hover mr-2"
content={msg.content}
/>
</div>
)}
</div>

View File

@@ -198,7 +198,7 @@
#ifndef __GNUC__
# define GGML_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
#elif defined(__MINGW32__) && !defined(__clang__)
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))

View File

@@ -473,7 +473,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
GGML_TABLE_END()
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
GGML_TABLE_END()
//#endif
GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)

View File

@@ -9074,10 +9074,6 @@ static void ggml_compute_forward_clamp_f32(
const struct ggml_tensor * src0 = dst->src[0];
if (params->ith != 0) {
return;
}
float min;
float max;
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
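For context on the fix above, here is a sketch of ggml's usual CPU threading pattern (not the exact clamp source, and assuming the row loop strides by nth as in other ggml ops): each worker thread handles rows `ith, ith + nth, ith + 2*nth, ...`, so an unconditional early return for `ith != 0` leaves every row outside thread 0's share unclamped once the tensor has more than one row.

```c
#include <math.h>
#include <stdint.h>

// Illustrative row partitioning (names follow ggml conventions; simplified signature):
// thread `ith` of `nth` clamps rows ith, ith + nth, ith + 2*nth, ...
static void clamp_rows(float * data, int64_t nrows, int64_t ncols,
                       float lo, float hi, int ith, int nth) {
    for (int64_t ir = ith; ir < nrows; ir += nth) {
        float * row = data + ir * ncols;
        for (int64_t i = 0; i < ncols; i++) {
            row[i] = fminf(fmaxf(row[i], lo), hi);
        }
    }
}
```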

View File

@@ -284,14 +284,14 @@ struct ggml_backend_cpu_device_context {
&hKey) == ERROR_SUCCESS) {
DWORD cpu_brand_size = 0;
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
"ProcessorNameString",
NULL,
NULL,
NULL,
&cpu_brand_size) == ERROR_SUCCESS) {
description.resize(cpu_brand_size);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
"ProcessorNameString",
NULL,
NULL,
(LPBYTE)&description[0], // NOLINT
@@ -534,9 +534,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_dotprod()) {
features.push_back({ "DOTPROD", "1" });
}
if (ggml_cpu_has_matmul_int8()) {
features.push_back({ "MATMUL_INT8", "1" });
}
if (ggml_cpu_get_sve_cnt() > 0) {
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
features.push_back({ "SVE_CNT", sve_cnt.c_str() });

View File

@@ -71,6 +71,47 @@
#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220
#ifdef __CUDA_ARCH_LIST__
constexpr bool ggml_cuda_has_arch_impl(int) {
return false;
}
template<class ... Archs>
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
}
constexpr bool ggml_cuda_has_arch(const int arch) {
return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
}
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
if (cur == 0) {
GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
}
return cur;
}
template<class ... Archs>
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
if (first <= arch && first > cur) {
return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
} else {
return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
}
}
constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
}
#else
static int ggml_cuda_highest_compiled_arch(const int arch) {
return arch;
}
#endif // __CUDA_ARCH_LIST__
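// Illustration (an assumption about nvcc behavior, not part of the change above): building with
//   -gencode arch=compute_61,... -gencode arch=compute_86,...
// makes nvcc define __CUDA_ARCH_LIST__ as 610,860, so the pack expansion above becomes
//   ggml_cuda_has_arch_impl(arch, 610, 860)
// and ggml_cuda_highest_compiled_arch(750) evaluates to 610, i.e. the highest arch that
// was actually compiled and does not exceed the device's compute capability.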
// ---------------------------------------------------------------------------------------------------------
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
#if defined(_MSC_VER)
@@ -124,11 +165,11 @@ static const char * cu_get_error_str(CUresult err) {
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100
#endif // CUDART_VERSION >= 11010
#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
@@ -162,18 +203,32 @@ typedef float2 dfloat2;
#define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
static constexpr bool fast_fp16_available(const int cc) {
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
}
static bool fast_fp16_available(const int cc) {
return fp16_available(cc) && cc != 610;
}
// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fast_fp16_hardware_available(const int cc) {
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
}
// Any FP16 tensor cores are available.
static constexpr bool fp16_mma_available(const int cc) {
// Any FP16 tensor core instructions are available for ggml code.
static bool fp16_mma_available(const int cc) {
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
}
// To be used for feature selection of external libraries, e.g. cuBLAS.
static bool fp16_mma_hardware_available(const int cc) {
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static constexpr bool new_mma_available(const int cc) {
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
static bool new_mma_available(const int cc) {
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {

View File

@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case GGML_TYPE_Q5_1:
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
return dequantize_block_q8_0_f16_cuda;
}
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;

View File

@@ -178,11 +178,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
int major_version = 0;
size_t version_length = 0;
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
std::string version(version_length, '\0');
std::vector<char> version(version_length+1, '\0');
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
version.resize(::strlen(version.c_str()));
version.resize(::strlen(version.data()));
int parsed_value = 0;
if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
major_version = parsed_value;
}
}
@@ -1867,14 +1867,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
}
} else {
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
}
// debug helpers
@@ -2840,7 +2840,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
return false;
}
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
@@ -2852,8 +2852,10 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
}
return true;
#else
GGML_UNUSED(buffer);
GGML_UNUSED(size);
return false;
#endif
#endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
}
void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
@@ -3205,8 +3207,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
return true;
}
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) &&
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
}
case GGML_OP_CROSS_ENTROPY_LOSS:
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:

View File

@@ -18,7 +18,7 @@ void ggml_cuda_op_mul_mat_q(
const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
const int cc = ggml_cuda_info().devices[id].cc;
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into
@@ -27,7 +27,8 @@ void ggml_cuda_op_mul_mat_q(
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
// Also its fixup needs to allocate a temporary buffer in the memory pool.
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
cc < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
switch (src0->type) {
@@ -136,7 +137,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
return true;
}
if (cc < GGML_CUDA_CC_DP4A) {
if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
return false;
}
@@ -145,8 +146,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#endif //GGML_CUDA_FORCE_MMQ
if (cc < GGML_CUDA_CC_OFFSET_AMD) {
return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

View File

@@ -86,12 +86,13 @@ struct tile_x_sizes {
int sc;
};
static constexpr int get_mmq_x_max_host(const int cc) {
static int get_mmq_x_max_host(const int cc) {
return new_mma_available(cc) ? 128 :
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ?
#ifdef GGML_CUDA_FORCE_MMQ
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
128 : 64;
#else
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
MMQ_DP4A_MAX_BATCH_SIZE : 64;
#endif // GGML_CUDA_FORCE_MMQ
}
@@ -119,8 +120,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
#endif // NEW_MMA_AVAILABLE
}
static constexpr int get_mmq_y_host(const int cc) {
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
static int get_mmq_y_host(const int cc) {
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
(ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? 128 : 64);
}
static constexpr __device__ int get_mmq_y_device() {
@@ -2828,7 +2830,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
const int mmq_x_max = get_mmq_x_max_host(cc);
const int mmq_y = get_mmq_y_host(cc);
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
int mmq_x_best = 0;
int nparts_best = INT_MAX;

View File

@@ -1,6 +1,6 @@
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
#define USE_CUB
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
#ifdef USE_CUB
#include <cub/cub.cuh>

View File

@@ -1 +1 @@
08b538031f7f944e84f472483ef5d26bf5190ead
98a61a0d0b43cba06c3ac1c603813639552a0701

View File

@@ -116,7 +116,7 @@ struct llama_grammar {
llama_partial_utf8 partial_utf8;
// lazy grammars wait for trigger words or tokens before constraining the sampling.
// we still ahve trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
// we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
// (useful e.g. for tool_choice=required)
bool lazy = false;
bool awaiting_trigger = false; // Initialized to true for lazy grammars only

View File

@@ -6,13 +6,13 @@
#include <vector>
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# if defined(__MINGW32__) && !defined(__clang__)
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
# define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
//

View File

@@ -697,8 +697,8 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
#ifdef _WIN32
if (!file) {
printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
printf("%s: skipping tests");
printf("failed to create tmpfile(), needs elevated privileges on Windows");
printf("skipping tests");
continue;
}
#else
@@ -1086,8 +1086,8 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
#ifdef _WIN32
if (!file) {
printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
printf("%s: skipping tests");
printf("failed to create tmpfile(), needs elevated privileges on Windows");
printf("skipping tests");
return std::make_pair(0, 0);
}
#else