Compare commits

...

3 Commits
b8787 ... b8790

Author SHA1 Message Date
Adrien Gallouët
be76dd0bb2 vendor : update BoringSSL to 0.20260413.0 (#21881)
Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-14 14:25:09 +03:00
Richard Davison
2e05f06ffb ggml : fix ARM NEON nvfp4 dot product on non-dotprod targets (#21559) 2026-04-14 14:23:45 +03:00
texasich
acc37a42ea cmake: fix CMP0194 warning on Windows with MSVC (#21630)
* cmake: fix CMP0194 warning on Windows with MSVC

Set CMP0194 policy to NEW before project() call in ggml/CMakeLists.txt to suppress the "MSVC is not an assembler for language ASM" warning introduced in CMake 4.1.

The ggml project enables ASM globally for Metal (macOS) and KleidiAI (ARM) backends. On Windows/MSVC, no assembler sources are used, but CMake 4.1+ warns because cl.exe is not a valid ASM compiler.

This follows the same pattern used in ggml-vulkan (CMP0114, CMP0147).

Closes ggml-org/llama.cpp#20311

* cmake: apply cisc's formatting suggestion

---------

Co-authored-by: texasich <texasich@users.noreply.github.com>
2026-04-14 13:47:56 +03:00
4 changed files with 51 additions and 8 deletions

View File

@@ -1,4 +1,11 @@
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
# ref: https://cmake.org/cmake/help/latest/policy/CMP0194.html
# MSVC is not a valid assembler for the ASM language.
# Set to NEW to avoid a warning on CMake 4.1+ with MSVC.
if (POLICY CMP0194)
cmake_policy(SET CMP0194 NEW)
endif()
project("ggml" C CXX ASM)
### GGML Version

View File

@@ -783,6 +783,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int8x16_t q4_lo_1 = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits_1, m4b));
const int8x16_t q4_hi_1 = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits_1, 4));
#if defined(__ARM_FEATURE_DOTPROD)
const int8x16_t q8_0a = vld1q_s8(y[2*ib].qs);
const int8x16_t q8_0b = vld1q_s8(y[2*ib].qs + 16);
const int8x16_t q8_lo_0 = vcombine_s8(vget_low_s8(q8_0a), vget_low_s8(q8_0b));
@@ -794,15 +795,40 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
const int8x16_t q8_hi_1 = vcombine_s8(vget_high_s8(q8_1a), vget_high_s8(q8_1b));
const int32x4_t p0 = vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
vdotq_s32(vdupq_n_s32(0), q4_lo_0, q8_lo_0),
vdotq_s32(vdupq_n_s32(0), q4_hi_0, q8_hi_0));
const int32x4_t p1 = vaddq_s32(
ggml_vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
ggml_vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
vdotq_s32(vdupq_n_s32(0), q4_lo_1, q8_lo_1),
vdotq_s32(vdupq_n_s32(0), q4_hi_1, q8_hi_1));
const int32x4_t sums = vpaddq_s32(p0, p1);
const int32x4_t sumi = vpaddq_s32(p0, p1);
#else
const int8x8_t q4_0_lo = vget_low_s8(q4_lo_0);
const int8x8_t q4_0_hi = vget_low_s8(q4_hi_0);
const int8x8_t q4_1_lo = vget_high_s8(q4_lo_0);
const int8x8_t q4_1_hi = vget_high_s8(q4_hi_0);
const int8x8_t q4_2_lo = vget_low_s8(q4_lo_1);
const int8x8_t q4_2_hi = vget_low_s8(q4_hi_1);
const int8x8_t q4_3_lo = vget_high_s8(q4_lo_1);
const int8x8_t q4_3_hi = vget_high_s8(q4_hi_1);
const int8x8_t q8_0_lo = vld1_s8(y[2*ib].qs);
const int8x8_t q8_0_hi = vld1_s8(y[2*ib].qs + 8);
const int8x8_t q8_1_lo = vld1_s8(y[2*ib].qs + 16);
const int8x8_t q8_1_hi = vld1_s8(y[2*ib].qs + 24);
const int8x8_t q8_2_lo = vld1_s8(y[2*ib+1].qs);
const int8x8_t q8_2_hi = vld1_s8(y[2*ib+1].qs + 8);
const int8x8_t q8_3_lo = vld1_s8(y[2*ib+1].qs + 16);
const int8x8_t q8_3_hi = vld1_s8(y[2*ib+1].qs + 24);
const int32x4_t sumi = (int32x4_t){
vaddvq_s32(ggml_nvfp4_dot8(q4_0_lo, q8_0_lo, q4_0_hi, q8_0_hi)),
vaddvq_s32(ggml_nvfp4_dot8(q4_1_lo, q8_1_lo, q4_1_hi, q8_1_hi)),
vaddvq_s32(ggml_nvfp4_dot8(q4_2_lo, q8_2_lo, q4_2_hi, q8_2_hi)),
vaddvq_s32(ggml_nvfp4_dot8(q4_3_lo, q8_3_lo, q4_3_hi, q8_3_hi)),
};
#endif
// Decode 4 UE4M3 scales to f32 and multiply with q8 scales
const float dy0 = GGML_CPU_FP16_TO_FP32(y[2*ib].d);
const float dy1 = GGML_CPU_FP16_TO_FP32(y[2*ib+1].d);
const float32x4_t nvsc = {
@@ -813,7 +839,7 @@ void ggml_vec_dot_nvfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
};
const float32x4_t scales = vmulq_f32(nvsc, (float32x4_t){dy0, dy0, dy1, dy1});
acc = vfmaq_f32(acc, vcvtq_f32_s32(sums), scales);
acc = vfmaq_f32(acc, vcvtq_f32_s32(sumi), scales);
}
sumf = vaddvq_f32(acc);
#else

View File

@@ -306,6 +306,7 @@ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
#if !defined(__ARM_FEATURE_DOTPROD)
// NOTE: this fallback produces the same total sum as native vdotq_s32 but with different per-lane grouping — do not use when individual lane values matter.
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
@@ -319,6 +320,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#endif // !defined(__ARM_FEATURE_DOTPROD)
static inline int32x4_t ggml_nvfp4_dot8(const int8x8_t q4_lo, const int8x8_t q8_lo,
const int8x8_t q4_hi, const int8x8_t q8_hi) {
const int16x8_t p_lo = vmull_s8(q4_lo, q8_lo);
const int16x8_t p_hi = vmull_s8(q4_hi, q8_hi);
const int32x4_t sum_lo = vpaddlq_s16(p_lo);
const int32x4_t sum_hi = vpaddlq_s16(p_hi);
return vaddq_s32(sum_lo, sum_hi);
}
#endif // defined(__ARM_NEON)
#ifdef __wasm_simd128__

View File

@@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL)
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
set(BORINGSSL_VERSION "0.20260327.0" CACHE STRING "BoringSSL version")
set(BORINGSSL_VERSION "0.20260413.0" CACHE STRING "BoringSSL version")
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")