mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
15 Commits
b4686
...
xsn/ci_leg
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8654805027 | ||
|
|
31afcbee0e | ||
|
|
5c4284d57b | ||
|
|
bfd11a2344 | ||
|
|
0fb77f821f | ||
|
|
e598697d63 | ||
|
|
fef0cbeadf | ||
|
|
748ee9fe93 | ||
|
|
198b1ec611 | ||
|
|
c3d6af7cd2 | ||
|
|
369be5598a | ||
|
|
4078c77f98 | ||
|
|
90e4dba461 | ||
|
|
a18f481f99 | ||
|
|
b9ab0a4d0b |
26
.github/workflows/docker.yml
vendored
26
.github/workflows/docker.yml
vendored
@@ -78,16 +78,32 @@ jobs:
|
||||
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
|
||||
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
|
||||
fi
|
||||
|
||||
# list all tags possible
|
||||
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
|
||||
TYPE=""
|
||||
TYPE=""
|
||||
else
|
||||
TYPE="-${{ matrix.config.tag }}"
|
||||
TYPE="-${{ matrix.config.tag }}"
|
||||
fi
|
||||
|
||||
# also publish to legacy, for smooth transition to ggml-org
|
||||
if [[ "${REPO_OWNER}/${REPO_NAME}" == "ggml-org/llama.cpp" ]]; then
|
||||
LEGACY_PREFIX="ghcr.io/ggerganov/llama.cpp:"
|
||||
LEGACY_FULLTAGS=",${LEGACY_PREFIX}full${TYPE},${LEGACY_PREFIX}full${TYPE}${TAG_POSTFIX}"
|
||||
LEGACY_LIGHTTAGS=",${LEGACY_PREFIX}light${TYPE},${LEGACY_PREFIX}light${TYPE}${TAG_POSTFIX}"
|
||||
LEGACY_SERVERTAGS=",${LEGACY_PREFIX}server${TYPE},${LEGACY_PREFIX}server${TYPE}${TAG_POSTFIX}"
|
||||
else
|
||||
LEGACY_PREFIX=""
|
||||
LEGACY_FULLTAGS=""
|
||||
LEGACY_LIGHTTAGS=""
|
||||
LEGACY_SERVERTAGS=""
|
||||
fi
|
||||
|
||||
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
|
||||
FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}${LEGACY_FULLTAGS}"
|
||||
LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}${LEGACY_LIGHTTAGS}"
|
||||
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}${LEGACY_SERVERTAGS}"
|
||||
|
||||
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
|
||||
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
|
||||
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
|
||||
|
||||
@@ -235,6 +235,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
| [HIP](docs/build.md#hip) | AMD GPU |
|
||||
| [Vulkan](docs/build.md#vulkan) | GPU |
|
||||
| [CANN](docs/build.md#cann) | Ascend NPU |
|
||||
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
||||
|
||||
## Building the project
|
||||
|
||||
|
||||
@@ -424,13 +424,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
|
||||
//
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
# if defined(__MINGW32__) && !defined(__clang__)
|
||||
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
# else
|
||||
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
# endif
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "log.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
#ifndef __GNUC__
|
||||
# define LOG_ATTRIBUTE_FORMAT(...)
|
||||
#elif defined(__MINGW32__)
|
||||
#elif defined(__MINGW32__) && !defined(__clang__)
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
|
||||
205
docs/backend/OPENCL.md
Normal file
205
docs/backend/OPENCL.md
Normal file
@@ -0,0 +1,205 @@
|
||||
# llama.cpp for OpenCL
|
||||
|
||||
- [Background](#background)
|
||||
- [OS](#os)
|
||||
- [Hardware](#hardware)
|
||||
- [DataType Supports](#datatype-supports)
|
||||
- [Model Preparation](#model-preparation)
|
||||
- [CMake Options](#cmake-options)
|
||||
- [Android](#android)
|
||||
- [Windows 11 Arm64](#windows-11-arm64)
|
||||
- [Known Issue](#known-issues)
|
||||
- [TODO](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
OpenCL (Open Computing Language) is an open, royalty-free standard for cross-platform, parallel programming of diverse accelerators found in supercomputers, cloud servers, personal computers, mobile devices and embedded platforms. OpenCL specifies a programming language (based on C99) for programming these devices and application programming interfaces (APIs) to control the platform and execute programs on the compute devices. Similar to CUDA, OpenCL has been widely used to program GPUs and is supported by most GPU vendors.
|
||||
|
||||
### Llama.cpp + OpenCL
|
||||
|
||||
The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adreno GPU** firstly via OpenCL. Thanks to the portabilty of OpenCL, the OpenCL backend can also run on certain Intel GPUs although the performance is not optimal.
|
||||
|
||||
## OS
|
||||
|
||||
| OS | Status | Verified |
|
||||
|---------|---------|------------------------------------------------|
|
||||
| Android | Support | Snapdragon 8 Gen 3, Snapdragon 8 Elite |
|
||||
| Windows | Support | Windows 11 Arm64 with Snapdragon X Elite |
|
||||
| Linux | Support | Ubuntu 22.04 WSL2 with Intel 12700H |
|
||||
|
||||
## Hardware
|
||||
|
||||
### Adreno GPU
|
||||
|
||||
**Verified devices**
|
||||
|
||||
| Adreno GPU | Status |
|
||||
|:------------------------------------:|:-------:|
|
||||
| Adreno 750 (Snapdragon 8 Gen 3) | Support |
|
||||
| Adreno 830 (Snapdragon 8 Elite) | Support |
|
||||
| Adreno X85 (Snapdragon X Elite) | Support |
|
||||
|
||||
## DataType Supports
|
||||
|
||||
| DataType | Status |
|
||||
|:----------------------:|:--------------------------:|
|
||||
| Q4_0 | Support |
|
||||
| Q6_K | Support, but not optimized |
|
||||
|
||||
## Model Preparation
|
||||
|
||||
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration.
|
||||
|
||||
Currently we support `Q4_0` quantization and have optimize for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,
|
||||
|
||||
```sh
|
||||
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
|
||||
```
|
||||
|
||||
Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.
|
||||
|
||||
## CMake Options
|
||||
|
||||
The OpenCL backend has the following CMake options that control the behavior of the backend.
|
||||
|
||||
| CMake options | Default value | Description |
|
||||
|:---------------------------------:|:--------------:|:------------------------------------------|
|
||||
| `GGML_OPENCL_EMBED_KERNELS` | `ON` | Embed OpenCL kernels into the executable. |
|
||||
| `GGML_OPENCL_USE_ADRENO_KERNELS` | `ON` | Use kernels optimized for Adreno. |
|
||||
|
||||
## Android
|
||||
|
||||
Ubuntu 22.04 is used for targeting Android. Make sure the following tools are accessible from command line,
|
||||
|
||||
* Git
|
||||
* CMake 3.29
|
||||
* Ninja
|
||||
* Python3
|
||||
|
||||
### I. Setup Environment
|
||||
|
||||
1. **Install NDK**
|
||||
|
||||
```sh
|
||||
cd ~
|
||||
wget https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && \
|
||||
unzip commandlinetools-linux-8512546_latest.zip && \
|
||||
mkdir -p ~/android-sdk/cmdline-tools && \
|
||||
mv cmdline-tools latest && \
|
||||
mv latest ~/android-sdk/cmdline-tools/ && \
|
||||
rm -rf commandlinetools-linux-8512546_latest.zip
|
||||
|
||||
yes | ~/android-sdk/cmdline-tools/latest/bin/sdkmanager "ndk;26.3.11579264"
|
||||
```
|
||||
|
||||
2. **Install OpenCL Headers and Library**
|
||||
|
||||
```sh
|
||||
mkdir -p ~/dev/llm
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers && \
|
||||
cd OpenCL-Headers && \
|
||||
cp -r CL ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
||||
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
|
||||
cd OpenCL-ICD-Loader && \
|
||||
mkdir build_ndk26 && cd build_ndk26 && \
|
||||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
|
||||
-DOPENCL_ICD_LOADER_HEADERS_DIR=$HOME/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=24 \
|
||||
-DANDROID_STL=c++_shared && \
|
||||
ninja && \
|
||||
cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
||||
```
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
```sh
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/ggerganov/llama.cpp && \
|
||||
cd llama.cpp && \
|
||||
mkdir build-android && cd build-android
|
||||
|
||||
cmake .. -G Ninja \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$HOME/android-sdk/ndk/26.3.11579264/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DBUILD_SHARED_LIBS=OFF \
|
||||
-DGGML_OPENCL=ON
|
||||
|
||||
ninja
|
||||
```
|
||||
|
||||
## Windows 11 Arm64
|
||||
|
||||
A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the following tools are accessible from command line,
|
||||
|
||||
* Git
|
||||
* CMake 3.29
|
||||
* Clang 19
|
||||
* Ninja
|
||||
* Visual Studio 2022
|
||||
|
||||
Powershell is used for the following instructions.
|
||||
|
||||
### I. Setup Environment
|
||||
|
||||
1. **Install OpenCL Headers and Library**
|
||||
|
||||
```powershell
|
||||
mkdir -p ~/dev/llm
|
||||
|
||||
cd ~/dev/llm
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
|
||||
mkdir build && cd build
|
||||
cmake .. -G Ninja `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
|
||||
cmake --build . --target install
|
||||
|
||||
cd ~/dev/llm
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
|
||||
mkdir build && cd build
|
||||
cmake .. -G Ninja `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
|
||||
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
|
||||
cmake --build . --target install
|
||||
```
|
||||
|
||||
### II. Build llama.cpp
|
||||
|
||||
```powershell
|
||||
|
||||
mkdir -p ~/dev/llm
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp
|
||||
mkdir build && cd build
|
||||
|
||||
cmake .. -G Ninja `
|
||||
-DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
|
||||
-DBUILD_SHARED_LIBS=OFF `
|
||||
-DGGML_OPENCL=ON
|
||||
ninja
|
||||
```
|
||||
|
||||
## Known Issues
|
||||
|
||||
- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
|
||||
|
||||
## TODO
|
||||
|
||||
- Fix Qwen2.5 0.5B
|
||||
- Optimization for Q6_K
|
||||
- Support and optimization for Q4_K
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
|
||||
Binary file not shown.
@@ -2279,7 +2279,7 @@ struct server_context {
|
||||
for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
common_detokenize(ctx, {cur_p->data[i].id}, special),
|
||||
common_token_to_piece(ctx, cur_p->data[i].id, special),
|
||||
cur_p->data[i].p
|
||||
});
|
||||
}
|
||||
@@ -2301,7 +2301,7 @@ struct server_context {
|
||||
for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
|
||||
result.probs.push_back({
|
||||
cur[i].id,
|
||||
common_detokenize(ctx, {cur[i].id}, special),
|
||||
common_token_to_piece(ctx, cur[i].id, special),
|
||||
cur[i].p
|
||||
});
|
||||
}
|
||||
|
||||
@@ -254,12 +254,12 @@ export default function ChatMessage({
|
||||
🔄 Regenerate
|
||||
</button>
|
||||
)}
|
||||
<CopyButton
|
||||
className="badge btn-mini show-on-hover mr-2"
|
||||
content={msg.content}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
<CopyButton
|
||||
className="badge btn-mini show-on-hover mr-2"
|
||||
content={msg.content}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
@@ -198,7 +198,7 @@
|
||||
|
||||
#ifndef __GNUC__
|
||||
# define GGML_ATTRIBUTE_FORMAT(...)
|
||||
#elif defined(__MINGW32__)
|
||||
#elif defined(__MINGW32__) && !defined(__clang__)
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
|
||||
@@ -473,7 +473,6 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
|
||||
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
||||
GGML_TABLE_END()
|
||||
|
||||
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
|
||||
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
|
||||
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
|
||||
@@ -508,7 +507,6 @@ GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
|
||||
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
|
||||
0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
|
||||
GGML_TABLE_END()
|
||||
//#endif
|
||||
|
||||
|
||||
GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
|
||||
|
||||
@@ -9074,10 +9074,6 @@ static void ggml_compute_forward_clamp_f32(
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
float min;
|
||||
float max;
|
||||
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
|
||||
|
||||
@@ -284,14 +284,14 @@ struct ggml_backend_cpu_device_context {
|
||||
&hKey) == ERROR_SUCCESS) {
|
||||
DWORD cpu_brand_size = 0;
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
"ProcessorNameString",
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
description.resize(cpu_brand_size);
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
"ProcessorNameString",
|
||||
NULL,
|
||||
NULL,
|
||||
(LPBYTE)&description[0], // NOLINT
|
||||
@@ -534,9 +534,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_dotprod()) {
|
||||
features.push_back({ "DOTPROD", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_matmul_int8()) {
|
||||
features.push_back({ "MATMUL_INT8", "1" });
|
||||
}
|
||||
if (ggml_cpu_get_sve_cnt() > 0) {
|
||||
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
|
||||
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
|
||||
|
||||
@@ -71,6 +71,47 @@
|
||||
#define GGML_CUDA_CC_QY1 210
|
||||
#define GGML_CUDA_CC_QY2 220
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
constexpr bool ggml_cuda_has_arch_impl(int) {
|
||||
return false;
|
||||
}
|
||||
|
||||
template<class ... Archs>
|
||||
constexpr bool ggml_cuda_has_arch_impl(const int arch, const int first, Archs... rest) {
|
||||
return arch == first || ggml_cuda_has_arch_impl(arch, rest...);
|
||||
}
|
||||
|
||||
constexpr bool ggml_cuda_has_arch(const int arch) {
|
||||
return ggml_cuda_has_arch_impl(arch, __CUDA_ARCH_LIST__);
|
||||
}
|
||||
|
||||
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur) {
|
||||
if (cur == 0) {
|
||||
GGML_ABORT("ggml was not compiled with any CUDA arch <= %d", arch);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
template<class ... Archs>
|
||||
constexpr int ggml_cuda_highest_compiled_arch_impl(const int arch, const int cur, const int first, Archs... rest) {
|
||||
if (first <= arch && first > cur) {
|
||||
return ggml_cuda_highest_compiled_arch_impl(arch, first, rest...);
|
||||
} else {
|
||||
return ggml_cuda_highest_compiled_arch_impl(arch, cur, rest...);
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int ggml_cuda_highest_compiled_arch(const int arch) {
|
||||
return ggml_cuda_highest_compiled_arch_impl(arch, 0, __CUDA_ARCH_LIST__);
|
||||
}
|
||||
#else
|
||||
static int ggml_cuda_highest_compiled_arch(const int arch) {
|
||||
return arch;
|
||||
}
|
||||
#endif // __CUDA_ARCH_LIST__
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@@ -124,11 +165,11 @@ static const char * cu_get_error_str(CUresult err) {
|
||||
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
|
||||
#endif
|
||||
|
||||
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
||||
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
|
||||
#else
|
||||
#define GGML_CUDA_ASSUME(x)
|
||||
#endif // CUDART_VERSION >= 11100
|
||||
#endif // CUDART_VERSION >= 11010
|
||||
|
||||
#ifdef GGML_CUDA_F16
|
||||
typedef half dfloat; // dequantize float
|
||||
@@ -162,18 +203,32 @@ typedef float2 dfloat2;
|
||||
#define FLASH_ATTN_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
|
||||
|
||||
static constexpr bool fast_fp16_available(const int cc) {
|
||||
static bool fp16_available(const int cc) {
|
||||
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
|
||||
}
|
||||
|
||||
static bool fast_fp16_available(const int cc) {
|
||||
return fp16_available(cc) && cc != 610;
|
||||
}
|
||||
|
||||
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
||||
static bool fast_fp16_hardware_available(const int cc) {
|
||||
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
|
||||
}
|
||||
|
||||
// Any FP16 tensor cores are available.
|
||||
static constexpr bool fp16_mma_available(const int cc) {
|
||||
// Any FP16 tensor core instructions are available for ggml code.
|
||||
static bool fp16_mma_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
|
||||
}
|
||||
|
||||
// To be used for feature selection of external libraries, e.g. cuBLAS.
|
||||
static bool fp16_mma_hardware_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
|
||||
}
|
||||
|
||||
// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
|
||||
static constexpr bool new_mma_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
|
||||
static bool new_mma_available(const int cc) {
|
||||
return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
||||
}
|
||||
|
||||
static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
||||
|
||||
@@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
||||
case GGML_TYPE_Q5_1:
|
||||
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
|
||||
case GGML_TYPE_Q8_0:
|
||||
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) {
|
||||
if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
|
||||
return dequantize_block_q8_0_f16_cuda;
|
||||
}
|
||||
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
|
||||
|
||||
@@ -178,11 +178,11 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
int major_version = 0;
|
||||
size_t version_length = 0;
|
||||
if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
|
||||
std::string version(version_length, '\0');
|
||||
std::vector<char> version(version_length+1, '\0');
|
||||
if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
|
||||
version.resize(::strlen(version.c_str()));
|
||||
version.resize(::strlen(version.data()));
|
||||
int parsed_value = 0;
|
||||
if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
|
||||
if (std::from_chars(version.data(), version.data() + version.size(), parsed_value).ec == std::errc()) {
|
||||
major_version = parsed_value;
|
||||
}
|
||||
}
|
||||
@@ -1867,14 +1867,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
|
||||
}
|
||||
} else {
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
|
||||
}
|
||||
|
||||
// debug helpers
|
||||
@@ -2840,7 +2840,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
|
||||
#if CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
@@ -2852,8 +2852,10 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
||||
}
|
||||
return true;
|
||||
#else
|
||||
GGML_UNUSED(buffer);
|
||||
GGML_UNUSED(size);
|
||||
return false;
|
||||
#endif
|
||||
#endif // CUDART_VERSION >= 11010 || defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
|
||||
@@ -3205,8 +3207,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
|
||||
return true;
|
||||
}
|
||||
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
|
||||
return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||
return fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc) &&
|
||||
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||
}
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||
|
||||
@@ -18,7 +18,7 @@ void ggml_cuda_op_mul_mat_q(
|
||||
const int64_t stride00 = ne00 / ggml_blck_size(src0->type);
|
||||
|
||||
int id = ggml_cuda_get_device();
|
||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
|
||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
@@ -27,7 +27,8 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
|
||||
cc < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
|
||||
|
||||
switch (src0->type) {
|
||||
@@ -136,7 +137,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (cc < GGML_CUDA_CC_DP4A) {
|
||||
if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -145,8 +146,8 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
#endif //GGML_CUDA_FORCE_MMQ
|
||||
|
||||
if (cc < GGML_CUDA_CC_OFFSET_AMD) {
|
||||
return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc) && !GGML_CUDA_CC_IS_GCN(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
@@ -86,12 +86,13 @@ struct tile_x_sizes {
|
||||
int sc;
|
||||
};
|
||||
|
||||
static constexpr int get_mmq_x_max_host(const int cc) {
|
||||
static int get_mmq_x_max_host(const int cc) {
|
||||
return new_mma_available(cc) ? 128 :
|
||||
ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ?
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
|
||||
128 : 64;
|
||||
#else
|
||||
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
|
||||
MMQ_DP4A_MAX_BATCH_SIZE : 64;
|
||||
#endif // GGML_CUDA_FORCE_MMQ
|
||||
}
|
||||
|
||||
@@ -119,8 +120,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
#endif // NEW_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static constexpr int get_mmq_y_host(const int cc) {
|
||||
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
|
||||
static int get_mmq_y_host(const int cc) {
|
||||
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
|
||||
(ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? 128 : 64);
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
@@ -2828,7 +2830,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
||||
const int mmq_y = get_mmq_y_host(cc);
|
||||
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||
const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
|
||||
const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
|
||||
|
||||
int mmq_x_best = 0;
|
||||
int nparts_best = INT_MAX;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
#define USE_CUB
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
|
||||
|
||||
#ifdef USE_CUB
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
@@ -1 +1 @@
|
||||
08b538031f7f944e84f472483ef5d26bf5190ead
|
||||
98a61a0d0b43cba06c3ac1c603813639552a0701
|
||||
|
||||
@@ -116,7 +116,7 @@ struct llama_grammar {
|
||||
llama_partial_utf8 partial_utf8;
|
||||
|
||||
// lazy grammars wait for trigger words or tokens before constraining the sampling.
|
||||
// we still ahve trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
|
||||
// we still have trigger_tokens for non-lazy grammars to force printing of special trigger tokens.
|
||||
// (useful e.g. for tool_choice=required)
|
||||
bool lazy = false;
|
||||
bool awaiting_trigger = false; // Initialized to true for lazy grammars only
|
||||
|
||||
@@ -6,13 +6,13 @@
|
||||
#include <vector>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
# if defined(__MINGW32__) && !defined(__clang__)
|
||||
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
# else
|
||||
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
# endif
|
||||
#else
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
#else
|
||||
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
# define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
@@ -697,8 +697,8 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
|
||||
|
||||
#ifdef _WIN32
|
||||
if (!file) {
|
||||
printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
|
||||
printf("%s: skipping tests");
|
||||
printf("failed to create tmpfile(), needs elevated privileges on Windows");
|
||||
printf("skipping tests");
|
||||
continue;
|
||||
}
|
||||
#else
|
||||
@@ -1086,8 +1086,8 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
|
||||
|
||||
#ifdef _WIN32
|
||||
if (!file) {
|
||||
printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
|
||||
printf("%s: skipping tests");
|
||||
printf("failed to create tmpfile(), needs elevated privileges on Windows");
|
||||
printf("skipping tests");
|
||||
return std::make_pair(0, 0);
|
||||
}
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user