Compare commits

...

3 Commits
b9331 ... b9334

Author SHA1 Message Date
Johannes Gäßler
192d8ae8b8 CUDA: missing PDL sync for FWHT, better fallback (#23690) 2026-05-26 11:05:51 +08:00
forforever73
35c9b1f39e metal : add apple device id (#23566)
Co-authored-by: lvyichen <lvyichen@stepfun.com>
2026-05-25 21:05:16 +03:00
Max Krasnyansky
4bead4e30d snapdragon: bump toolchain docker to v0.7 to fix ui build issues (#23680) 2026-05-25 10:57:43 -07:00
7 changed files with 92 additions and 28 deletions

View File

@@ -31,7 +31,7 @@ jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6'
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
defaults:
run:
shell: bash
@@ -61,7 +61,7 @@ jobs:
linux-iot-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6'
image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
defaults:
run:
shell: bash

View File

@@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
```
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.7
[d]/> cd /workspace
```

View File

@@ -19,6 +19,7 @@ __global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows,
float reg[el_w];
const int lane = threadIdx.x;
ggml_cuda_pdl_sync();
#pragma unroll
for (int i = 0; i < el_w; ++i) {
reg[i] = src[i * warp_size + lane] * scale;
@@ -57,10 +58,11 @@ __global__ void fwht_cuda(const float * src, float * dst, const int64_t n_rows,
}
}
void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst) {
GGML_ASSERT(ggml_are_same_shape(src, dst));
GGML_ASSERT(ggml_is_contiguous(src));
GGML_ASSERT(ggml_is_contiguous(dst));
if (!ggml_is_contiguous(src) || !ggml_is_contiguous(dst)) {
return false;
}
const int n = src->ne[0];
const int64_t rows = ggml_nrows(src);
@@ -68,7 +70,6 @@ void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src,
float * dst_d = (float *) dst->data;
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
GGML_ASSERT(n % warp_size == 0);
const int rows_per_block = 4;
const int64_t num_blocks = (rows + rows_per_block - 1) / rows_per_block;
@@ -83,26 +84,18 @@ void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src,
switch (n) {
case 64:
{
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
break;
}
ggml_cuda_kernel_launch(fwht_cuda<64>, launch_params, src_d, dst_d, rows, scale);
return true;
case 128:
{
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
break;
}
ggml_cuda_kernel_launch(fwht_cuda<128>, launch_params, src_d, dst_d, rows, scale);
return true;
case 256:
{
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
break;
}
ggml_cuda_kernel_launch(fwht_cuda<256>, launch_params, src_d, dst_d, rows, scale);
return true;
case 512:
{
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
break;
}
ggml_cuda_kernel_launch(fwht_cuda<512>, launch_params, src_d, dst_d, rows, scale);
return true;
default:
GGML_ABORT("fatal error");
return false;
}
}

View File

@@ -1,3 +1,4 @@
#include "common.cuh"
void ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);
// Returns whether the Fast Walsh-Hadamard transform could be used.
bool ggml_cuda_op_fwht(ggml_backend_cuda_context & ctx, const ggml_tensor * src, ggml_tensor * dst);

View File

@@ -2596,9 +2596,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
const int32_t hint = ggml_get_op_params_i32(dst, 1);
if (hint == GGML_HINT_SRC0_IS_HADAMARD) {
GGML_ASSERT(!split);
ggml_cuda_op_fwht(ctx, src1, dst);
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !split && ggml_cuda_op_fwht(ctx, src1, dst)) {
return;
}

View File

@@ -215,6 +215,30 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
// device
//
enum ggml_metal_device_id {
GGML_METAL_DEVICE_GENERIC = 0,
GGML_METAL_DEVICE_M1,
GGML_METAL_DEVICE_M1_PRO,
GGML_METAL_DEVICE_M1_MAX,
GGML_METAL_DEVICE_M1_ULTRA,
GGML_METAL_DEVICE_M2,
GGML_METAL_DEVICE_M2_PRO,
GGML_METAL_DEVICE_M2_MAX,
GGML_METAL_DEVICE_M2_ULTRA,
GGML_METAL_DEVICE_M3,
GGML_METAL_DEVICE_M3_PRO,
GGML_METAL_DEVICE_M3_MAX,
GGML_METAL_DEVICE_M3_ULTRA,
GGML_METAL_DEVICE_M4,
GGML_METAL_DEVICE_M4_PRO,
GGML_METAL_DEVICE_M4_MAX,
GGML_METAL_DEVICE_M5,
GGML_METAL_DEVICE_M5_PRO,
GGML_METAL_DEVICE_M5_MAX,
GGML_METAL_DEVICE_M5_ULTRA,
};
struct ggml_metal_device_props {
int device;
char name[128];
@@ -234,6 +258,8 @@ struct ggml_metal_device_props {
bool supports_gpu_family_apple7;
enum ggml_metal_device_id device_id;
int op_offload_min_batch_size;
};

View File

@@ -628,6 +628,50 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
free(rsets);
}
static enum ggml_metal_device_id ggml_metal_device_id_parse(const char * name) {
if (!name) {
return GGML_METAL_DEVICE_GENERIC;
}
static const char prefix[] = "Apple ";
if (strncmp(name, prefix, sizeof(prefix) - 1) != 0) {
return GGML_METAL_DEVICE_GENERIC;
}
const char * suffix = name + sizeof(prefix) - 1;
static const struct {
const char * name;
enum ggml_metal_device_id id;
} table[] = {
{"M1", GGML_METAL_DEVICE_M1},
{"M1 Pro", GGML_METAL_DEVICE_M1_PRO},
{"M1 Max", GGML_METAL_DEVICE_M1_MAX},
{"M1 Ultra", GGML_METAL_DEVICE_M1_ULTRA},
{"M2", GGML_METAL_DEVICE_M2},
{"M2 Pro", GGML_METAL_DEVICE_M2_PRO},
{"M2 Max", GGML_METAL_DEVICE_M2_MAX},
{"M2 Ultra", GGML_METAL_DEVICE_M2_ULTRA},
{"M3", GGML_METAL_DEVICE_M3},
{"M3 Pro", GGML_METAL_DEVICE_M3_PRO},
{"M3 Max", GGML_METAL_DEVICE_M3_MAX},
{"M3 Ultra", GGML_METAL_DEVICE_M3_ULTRA},
{"M4", GGML_METAL_DEVICE_M4},
{"M4 Pro", GGML_METAL_DEVICE_M4_PRO},
{"M4 Max", GGML_METAL_DEVICE_M4_MAX},
{"M5", GGML_METAL_DEVICE_M5},
{"M5 Pro", GGML_METAL_DEVICE_M5_PRO},
{"M5 Max", GGML_METAL_DEVICE_M5_MAX},
{"M5 Ultra", GGML_METAL_DEVICE_M5_ULTRA},
};
for (size_t i = 0; i < sizeof(table)/sizeof(table[0]); ++i) {
if (strcmp(suffix, table[i].name) == 0) {
return table[i].id;
}
}
return GGML_METAL_DEVICE_GENERIC;
}
ggml_metal_device_t ggml_metal_device_init(int device) {
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
@@ -795,6 +839,8 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
dev->props.supports_gpu_family_apple7 = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
dev->props.device_id = ggml_metal_device_id_parse([[dev->mtl_device name] UTF8String]);
dev->props.op_offload_min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
dev->props.max_buffer_size = dev->mtl_device.maxBufferLength;