Compare commits

...

16 Commits
b7381 ... b7397

Author SHA1 Message Date
lovedheart
4722671641 vulkan: improve mul_mat_vec_iq1_s speed (#17874) 2025-12-14 08:47:49 +01:00
Eve
d15d177f43 vulkan: faster q6_k matmul (#17813)
* q6_k faster mul mat

* 8 values

* fix comment

* switch to two at a time

* start ci for .glsl files
2025-12-14 08:29:37 +01:00
Georgi Gerganov
77ad8542bd model-conversion : cast logits to float32 (#18009) 2025-12-14 08:58:13 +02:00
Georgi Gerganov
609a2d0268 models : fix YaRN regression + consolidate logic (#18006)
* models : fix YaRN regression + consolidate logic

* cont : fix the fix

* cont : remove header

* cont : add header
2025-12-14 08:34:56 +02:00
Georgi Gerganov
a63cbafbbc ggml : arm repack fix build 2025-12-14 08:33:51 +02:00
Georgi Gerganov
0e59224990 sync : ggml 2025-12-14 08:33:51 +02:00
Georgi Gerganov
71fdcf0616 ggml : arm repack fix build (whisper/0) 2025-12-14 08:33:51 +02:00
Congcong Cai
615655aafe cmake : set CMAKE_RUNTIME_OUTPUT_DIRECTORY for non standalone build (ggml/1394)
Some backends depend on CMAKE_RUNTIME_OUTPUT_DIRECTORY to create temporary files (e.g. the Metal backend).
A missing CMAKE_RUNTIME_OUTPUT_DIRECTORY can cause CMake errors such as permission denied (from trying to copy files to the root directory).
This PR sets a default path for CMAKE_RUNTIME_OUTPUT_DIRECTORY when it is not already defined.
2025-12-14 08:33:51 +02:00
Xuan-Son Nguyen
c00ff929dc scripts: add script to compare logprobs of llama.cpp against other frameworks (#17947)
* scripts: add script to compare logits of llama.cpp against other frameworks

* accept custom prompt file

* fix code style

* clarify endpoint

* fix displaying

* use abs for diff

* fix vllm case

* rm output file

* rename to compare-logprobs

* add "pattern"
2025-12-13 22:33:29 +01:00
Sergey Fedorov
4ed2bae50d server-models.cpp: add missing <filesystem> (#18000)
Fixes: https://github.com/ggml-org/llama.cpp/issues/17999
2025-12-13 22:02:43 +01:00
Jeff Bolz
5266379bca llama_context: synchronize before reallocating output buffer (#17974) 2025-12-13 09:19:51 -06:00
Xuan-Son Nguyen
4d5ae24c0a arg: fix common_params_parse not accepting negated arg (#17991) 2025-12-13 12:53:37 +01:00
Gustavo Rocha Dias
66ba51252e cmake: correct scope - link ws2_32 for MinGW/w64devkit builds in cpp-httplib (#17972)
* fix - w64devkit build

* fix - w64devkit build private scope
2025-12-13 12:46:36 +01:00
Jeff Bolz
36255a2268 vulkan: support get_rows for i32 (#17941) 2025-12-13 10:12:53 +01:00
Jeff Bolz
3229a23fa6 vulkan: support GGML_OP_DIAG (#17893) 2025-12-13 10:07:49 +01:00
Jeff Bolz
303f8615e9 vulkan: Multi-pass softmax for large number of cols (#17892)
When the number of cols is large, split each row across multiple workgroups.
There are three phases that communicate partial results through temp buffers:
(1) compute max partials
(2) take max of partials, compute sum(exp(x-max)) partials
(3) sum partials, compute scaled result
2025-12-13 10:04:29 +01:00
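For intuition, the three phases map onto a straightforward chunked softmax; a minimal NumPy sketch of the same scheme (illustrative only, not the shader — chunk size and names are arbitrary):

import numpy as np

def softmax_multipass(x: np.ndarray, chunk: int = 512) -> np.ndarray:
    chunks = [x[i:i + chunk] for i in range(0, len(x), chunk)]
    # (1) compute max partials, one per workgroup/chunk
    maxes = np.array([c.max() for c in chunks])
    # (2) take max of partials, compute sum(exp(x - max)) partials
    m = maxes.max()
    sums = np.array([np.exp(c - m).sum() for c in chunks])
    # (3) sum partials, compute scaled result
    return np.exp(x - m) / sums.sum()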
29 changed files with 794 additions and 88 deletions

View File

@@ -20,7 +20,8 @@ on:
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
'**/*.comp',
'**/*.glsl'
]
pull_request:
@@ -40,7 +41,8 @@ on:
'**/*.swift',
'**/*.m',
'**/*.metal',
'**/*.comp'
'**/*.comp',
'**/*.glsl'
]
concurrency:

.gitignore
View File

@@ -54,6 +54,7 @@
/out/
/tmp/
/autogen-*.md
/common/build-info.cpp
# Deprecated

View File

@@ -724,7 +724,7 @@ static void add_rpc_devices(const std::string & servers) {
}
}
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
common_params dummy_params;
common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
@@ -733,6 +733,9 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<comm
for (const auto & arg : opt.args) {
arg_to_options[arg] = &opt;
}
for (const auto & arg : opt.args_neg) {
arg_to_options[arg] = &opt;
}
}
// TODO @ngxson : find a way to deduplicate this code

View File

@@ -115,7 +115,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
// parse input arguments from CLI into a map
// TODO: support repeated args in the future
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);

View File

@@ -200,7 +200,7 @@ with torch.no_grad():
logits = outputs.logits
# Extract logits for the last token (next token prediction)
last_logits = logits[0, -1, :].cpu().numpy()
last_logits = logits[0, -1, :].float().cpu().numpy()
print(f"Logits shape: {logits.shape}")
print(f"Last token logits shape: {last_logits.shape}")

View File

@@ -54,6 +54,10 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
# TODO
else()
set(GGML_STANDALONE OFF)
if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
endif()
endif()
if (EMSCRIPTEN)

View File

@@ -24,6 +24,7 @@
#define UNUSED GGML_UNUSED
#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
int16x8_t * out_mins,
int8_t * out_scales) {
@@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
memcpy(out_scales, scales_u32, 8);
}
#endif
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);

View File

@@ -659,6 +659,7 @@ struct vk_device_struct {
vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_log[2];
vk_pipeline pipeline_tri[2];
vk_pipeline pipeline_diag[2];
vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_roll_f32;
@@ -722,6 +723,11 @@ struct vk_device_struct {
vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
vk_pipeline pipeline_soft_max_back_f32;
vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
@@ -3732,6 +3738,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@@ -3919,6 +3926,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
@@ -3998,6 +4008,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@@ -8278,6 +8295,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
switch (op) {
case GGML_OP_GET_ROWS:
GGML_ASSERT(src1->type == GGML_TYPE_I32);
if (src0->type == GGML_TYPE_I32) {
// i32 src only supports i32 result
GGML_ASSERT(dst->type == GGML_TYPE_I32);
return ctx->device->pipeline_get_rows[src0->type];
}
if (dst->type == GGML_TYPE_F16) {
return ctx->device->pipeline_get_rows[src0->type];
}
@@ -8404,6 +8426,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
}
return nullptr;
case GGML_OP_DIAG:
if (src0->type == dst->type &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
}
return nullptr;
case GGML_OP_CLAMP:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_clamp_f32;
@@ -9097,6 +9125,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_COS:
case GGML_OP_LOG:
case GGML_OP_TRI:
case GGML_OP_DIAG:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_ROLL:
@@ -9784,6 +9813,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
}
static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
p.param1 = ggml_get_op_params_f32(dst, 0);
@@ -10117,7 +10152,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
vk_op_soft_max_push_constants pc {
ncols,
src1 != nullptr ? nrows_y : (uint32_t)0,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
@@ -10128,7 +10163,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
n_head_log2,
nrows_x,
src2 != nullptr
});
};
if (ncols <= 16384) {
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
} else {
vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
uint32_t elems_per_wg = 128 * 4;
uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
size_t tmp_size = num_wgs * nrows_x * sizeof(float);
if (ctx->prealloc_size_x < tmp_size) {
ctx->prealloc_size_x = tmp_size;
ggml_vk_preallocate_buffers(ctx, subctx);
}
if (ctx->prealloc_size_y < tmp_size) {
ctx->prealloc_size_y = tmp_size;
ggml_vk_preallocate_buffers(ctx, subctx);
}
if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
ggml_vk_sync_buffers(ctx, subctx);
}
vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
ggml_vk_sync_buffers(ctx, subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
ggml_vk_sync_buffers(ctx, subctx);
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
ctx->prealloc_x_need_sync = true;
ctx->prealloc_y_need_sync = true;
}
}
static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -11864,6 +11947,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_TRI:
ggml_vk_tri(ctx, compute_ctx, src0, node);
break;
case GGML_OP_DIAG:
ggml_vk_diag(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CLAMP:
ggml_vk_clamp(ctx, compute_ctx, src0, node);
@@ -13883,6 +13970,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_MXFP4:
case GGML_TYPE_I32:
return true;
default:
return false;
@@ -14007,6 +14095,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_LOG:
case GGML_OP_TRI:
case GGML_OP_DIAG:
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->type == op->src[0]->type;
case GGML_OP_ARGSORT:
@@ -14597,6 +14686,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
tensor_clone = ggml_log(ggml_ctx, src_clone[0]);
} else if (tensor->op == GGML_OP_TRI) {
tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0));
} else if (tensor->op == GGML_OP_DIAG) {
tensor_clone = ggml_diag(ggml_ctx, src_clone[0]);
} else if (tensor->op == GGML_OP_CLAMP) {
const float * params = (const float *)tensor->op_params;
tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);

View File

@@ -0,0 +1,29 @@
#version 450
#include "rte.glsl"
#include "types.glsl"
#include "generic_unary_head.glsl"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
const uint i12_offset = i12*p.ne11*p.ne10;
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
if (i10 == i11) {
const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]);
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val);
} else {
data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0);
}
}
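For reference, GGML_OP_DIAG expands a vector into a diagonal matrix; the shader's i10 == i11 test above is the per-element form of the NumPy equivalent (illustrative sketch, ignoring the batch dimensions):

import numpy as np

v = np.array([1.0, 2.0, 3.0], dtype=np.float32)
d = np.diag(v)  # 3x3 matrix with v on the diagonal, zeros elsewhere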

View File

@@ -26,9 +26,9 @@ void main() {
const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;
#if defined(DATA_A_BF16)
FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
#else
FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]);
TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]);
#endif
#ifndef OPTIMIZATION_ERROR_WORKAROUND
data_d[d_offset + i00] = D_TYPE(v);

View File

@@ -7,34 +7,50 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
const uint y_idx = i * QUANT_K + 32 * ib32;
void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
const uint y_idx_base = i * QUANT_K + 32 * ib32;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4;
[[unroll]] for (uint l = 0; l < 4; ++l) {
const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]);
const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]);
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const float d = float(data_a[ibi].d);
const uint qh = data_a[ibi].qh[ib32];
const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
// index for data_a
uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
[[unroll]] for (uint l = 0; l < 4; ++l) {
const uint qs = data_a[ibi].qs[4 * ib32 + l];
const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]);
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const float d = float(data_a[ibi].d);
const uint qh = data_a[ibi].qh[ib32];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
const uint qs = data_a[ibi].qs[4 * ib32 + l];
const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]);
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int k = 0; k < 4; ++k) {
sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
}
temp[j][n] = fma(dl, sum, temp[j][n]);
const float delta_val = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
const vec4 delta_v = vec4(delta_val);
const vec4 fbits0 = vec4(
float(bitfieldExtract(grid, 0, 2)),
float(bitfieldExtract(grid, 2, 2)),
float(bitfieldExtract(grid, 4, 2)),
float(bitfieldExtract(grid, 6, 2))
);
const vec4 fbits1 = vec4(
float(bitfieldExtract(grid, 8, 2)),
float(bitfieldExtract(grid, 10, 2)),
float(bitfieldExtract(grid, 12, 2)),
float(bitfieldExtract(grid, 14, 2))
);
vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0));
sum_v = fma(b_val_1, fbits1 + delta_v, sum_v);
FLOAT_TYPE sum = dot(sum_v, vec4(1.0));
temp[j][n] = fma(dl, sum, temp[j][n]);
ibi += num_blocks_per_row;
}
}
ibi += num_blocks_per_row;
}
}

View File

@@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint iqs = idx % 128; // 0..127
const uint n = iqs / 64; // 0,1
const uint b = (iqs % 64) / 32; // 0,1
const uint b = ((iqs % 64) / 32) * 4; // 0,4
const uint is_b = (iqs % 16) / 8; // 0,1
const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
const uint is = 8 * n + qhshift + is_b; // 0..15
const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126
const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62
const uint qsi = n * 32 + (iqs % 32); // 0..63
const uint qhi = n * 16 + (iqs % 16); // 0..31
const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32),
dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F;
const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303;
const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale;
buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y);
#elif defined(DATA_A_IQ1_S)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
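The packed16 load above decodes two q6_K values per iteration; a Python mirror of that bit manipulation (illustrative only, names follow the shader):

def decode_q6k_pair(ql16: int, qh16: int, b: int, qhshift: int, dscale: float):
    ql = (ql16 >> b) & 0x0F0F        # two low nibbles at once
    qh = (qh16 >> qhshift) & 0x0303  # two 2-bit high parts at once
    q = ql | (qh << 4)               # two 6-bit values, one per byte
    lo, hi = q & 0xFF, (q >> 8) & 0xFF
    return (lo - 32) * dscale, (hi - 32) * dscale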

View File

@@ -0,0 +1,62 @@
#version 450
#include "soft_max_large_common.glsl"
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.y;
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
const uint32_t i01 = rowx % p.ne01;
uint rowy_start = 0;
if (p.KY > 0) {
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
}
if (rowx >= p.nrows_x) {
return;
}
float slope = get_slope(rowx);
// Find max
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
const uint col = col0 + tid;
FLOAT_TYPE a = FLOAT_TYPE(0);
if (col < p.KX) {
a = data_a[rowx * p.KX + col];
}
FLOAT_TYPE b = FLOAT_TYPE(0);
if (p.KY > 0 && col < p.KX) {
b = data_b[rowy_start + col];
}
FLOAT_TYPE v = a * p.scale + slope * b;
if (col < p.KX) {
max_val = max(max_val, v);
}
}
// reduce across the workgroup
vals[tid] = max_val;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] = max(vals[tid], vals[tid + s]);
}
barrier();
}
if (tid == 0) {
max_val = vals[0];
data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val;
}
}

View File

@@ -0,0 +1,79 @@
#version 450
#include "soft_max_large_common.glsl"
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.y;
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
const uint32_t i01 = rowx % p.ne01;
uint rowy_start = 0;
if (p.KY > 0) {
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
}
if (rowx >= p.nrows_x) {
return;
}
float slope = get_slope(rowx);
// Find max
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
[[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
if (i + tid < gl_NumWorkGroups.x) {
max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
}
}
// reduce across the workgroup
vals[tid] = max_val;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] = max(max_val, vals[tid + s]);
}
barrier();
}
max_val = vals[0];
barrier();
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
// Compute sum{exp(x - max)}
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
const uint col = col0 + tid;
if (col >= p.KX) {
break;
}
// compute exp(a*scale+b*slope), add it to sum
const uint i = rowx * p.KX + col;
FLOAT_TYPE val;
val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val);
sum += val;
data_d[i] = D_TYPE(val);
}
// reduce across the workgroup
vals[tid] = sum;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] += vals[tid + s];
}
barrier();
}
if (tid == 0) {
sum = vals[0];
data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum;
}
}

View File

@@ -0,0 +1,65 @@
#version 450
#include "soft_max_large_common.glsl"
shared FLOAT_TYPE sumsh[BLOCK_SIZE];
void main() {
const uint tid = gl_LocalInvocationID.x;
const uint rowx = gl_WorkGroupID.y;
const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
const uint32_t i03 = rowx / (p.ne01 * p.ne02);
const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
const uint32_t i01 = rowx % p.ne01;
uint rowy_start = 0;
if (p.KY > 0) {
rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
}
if (rowx >= p.nrows_x) {
return;
}
FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
[[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
if (i + tid < gl_NumWorkGroups.x) {
max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
sum += data_s[rowx * gl_NumWorkGroups.x + i + tid];
}
}
// reduce across the workgroup
vals[tid] = max_val;
sumsh[tid] = sum;
barrier();
[[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
vals[tid] = max(max_val, vals[tid + s]);
sumsh[tid] += sumsh[tid + s];
}
barrier();
}
max_val = vals[0];
sum = sumsh[0];
if (p.has_sinks != 0) {
sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val));
}
FLOAT_TYPE rcpdivisor = 1.0/sum;
[[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
const uint col = col0 + tid;
if (col >= p.KX) {
continue;
}
data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor);
}
}

View File

@@ -0,0 +1,53 @@
#extension GL_EXT_control_flow_attributes : enable
layout (push_constant) uniform parameter
{
uint KX;
uint KY;
uint ne00;
uint ne01;
uint ne02;
uint ne12;
uint ne13;
uint nb11;
uint nb12;
uint nb13;
float scale;
float max_bias;
float m0;
float m1;
uint n_head_log2;
uint nrows_x;
uint has_sinks;
} p;
#include "types.glsl"
layout(constant_id = 0) const uint BLOCK_SIZE = 128;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(constant_id = 1) const uint num_iters = 4;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
layout (binding = 2) readonly buffer Z {float data_c[];};
layout (binding = 3) buffer D {D_TYPE data_d[];};
layout (binding = 4) buffer M {float data_m[];};
layout (binding = 5) buffer S {float data_s[];};
shared FLOAT_TYPE vals[BLOCK_SIZE];
float get_slope(uint rowx) {
float slope = 1.0f;
// ALiBi
if (p.max_bias > 0.0f) {
const uint h = (rowx / p.ne01) % p.ne02; // head index
const float base = h < p.n_head_log2 ? p.m0 : p.m1;
const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1;
slope = pow(base, exp);
}
return slope;
}
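The ALiBi slope rule above, as a quick Python reference (mirrors get_slope; all parameters come from the push constants):

def alibi_slope(rowx, ne01, ne02, n_head_log2, m0, m1, max_bias):
    if max_bias <= 0.0:
        return 1.0
    h = (rowx // ne01) % ne02  # head index
    base = m0 if h < n_head_log2 else m1
    e = h + 1 if h < n_head_log2 else 2 * (h - n_head_log2) + 1
    return base ** e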

View File

@@ -704,13 +704,15 @@ void process_shaders() {
shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp";
if (tname == "f16") {
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
} else {
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
}
string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
}
string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}});
string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
@@ -854,6 +856,8 @@ void process_shaders() {
string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
@@ -899,6 +903,13 @@ void process_shaders() {
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
string_to_spv("soft_max_large3_f32_f16", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});

scripts/compare-logprobs.py
View File

@@ -0,0 +1,281 @@
import argparse
import requests
import json
from pathlib import Path
import logging
logger = logging.getLogger("compare-logprobs")
logging.basicConfig(level=logging.INFO)
DESCRIPTION = """
Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints.
Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally.
Example usage:
Step 1: Dump logits from two different servers
python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions
python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions
(optionally, you can add --api-key <key> if the endpoint requires authentication)
Step 2: Compare the dumped logits
python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md
"""
def generate_input_prompt(length: int) -> list[str]:
CORPUS = """
You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.
### Tool Call Format:
When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.
You can make multiple calls in one go by placing them one after another.
"""
words = [w.strip() for w in CORPUS.strip().split(" ")]
words = [w for w in words if len(w) > 0] # filter out empty strings
while len(words) < length:
words += words
return words[:length]
def dump_logits(
endpoint: str,
output_path: Path,
input_words: list[str],
pattern: list[tuple[bool, int]],
api_key=None,
):
logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...")
words = input_words
curr_text = ""
n_total = sum(n for get, n in pattern if get)
n_done = 0
i_cur = 0
i_total = len(words)
with output_path.open("w") as f:
for get, n in pattern:
if not get:
# skip n words
for i in range(n):
curr_text += words.pop(0) + " "
i_cur += 1
continue
# get n words
for i in range(n):
curr_text += words.pop(0) + " "
payload = {
"prompt": curr_text.strip(),
"temperature": 0.0,
"top_k": 1,
"max_tokens": 1,
"logprobs": 1,
"stream": False,
}
response = requests.post(
endpoint,
json=payload,
headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
)
response.raise_for_status()
data = response.json()
data["__index"] = i_cur # add index for easier debugging later
data = json.dumps(data)
f.write(f"{data}\n")
n_done += 1
i_cur += 1
logger.info(
f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]"
)
logger.info(f"Logits dumped to {output_path}")
def get_token_logprobs(data: dict):
logprobs = data["choices"][0]["logprobs"]
if "content" in logprobs:
# llama.cpp case
top = logprobs["content"][0]["top_logprobs"][0]
return top["token"], top["logprob"]
else:
# vllm case
tokens = logprobs["tokens"]
token_logprobs = logprobs["token_logprobs"]
return tokens[0], token_logprobs[0]
def clean_text(text: str) -> str:
return (
"'"
+ text.replace("\n", "\\n")
.replace("\t", "\\t")
.replace("\r", "\\r")
.replace("|", "\\|")
+ "'"
)
def compare_logits(input1: Path, input2: Path, output_path: Path):
with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout:
lines1 = f1.readlines()
lines2 = f2.readlines()
tab_header = [
"idx",
input1.name,
"logprob_1",
input2.name,
"logprob_2",
"diff (abs)",
]
tab_entries = []
tab_max_widths = [len(h) for h in tab_header]
assert len(lines1) == len(
lines2
), "Input files must have the same number of lines."
fout.write("# Logits Comparison Report\n\n")
for i, (line1, line2) in enumerate(zip(lines1, lines2)):
if not line1.strip() or not line2.strip():
continue # skip empty lines
data1 = json.loads(line1)
data2 = json.loads(line2)
idx1 = data1.get("__index", -1)
idx2 = data2.get("__index", -1)
if idx1 != idx2:
logger.warning(
f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}"
)
token1, logprob1 = get_token_logprobs(data1)
token2, logprob2 = get_token_logprobs(data2)
token1 = clean_text(token1)
token2 = clean_text(token2)
abs_diff = abs(logprob1 - logprob2)
tab_entries.append(
(
str(idx1 + 1),
token1,
f"{logprob1:.4f}",
token2,
f"{logprob2:.4f}",
f"{(abs_diff):.4f}",
)
)
for i in range(len(tab_entries)):
for j in range(len(tab_header)):
tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j]))
output = ""
for j in range(len(tab_header)):
output += f"| {tab_header[j]:<{tab_max_widths[j]}} "
output += "|\n"
for j in range(len(tab_header)):
output += f"|{'-' * (tab_max_widths[j] + 2)}"
output += "|\n"
for entry in tab_entries:
for j in range(len(tab_header)):
output += f"| {entry[j]:<{tab_max_widths[j]}} "
output += "|\n"
logger.info("\n" + output)
fout.write(output)
logger.info(f"Report written to {output_path}")
def parse_pattern(pattern: str) -> list[tuple[bool, int]]:
parts = pattern.split(",")
result = []
for i, part in enumerate(parts):
n = int(part)
if i % 2 == 0:
result.append((True, n)) # get n words
else:
result.append((False, n)) # skip n words
return result
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter
)
subparsers = parser.add_subparsers(
dest="verb", required=True, help="action to perform"
)
# dump subcommand
parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint")
parser_dump.add_argument(
"output", type=Path, help="output path for dumped logits (.log)"
)
parser_dump.add_argument(
"endpoint", type=str, help="OAI-compat /completions endpoint"
)
parser_dump.add_argument(
"--api-key",
type=str,
default=None,
help="API key for authentication (if required)",
)
parser_dump.add_argument(
"--file",
type=Path,
default=None,
help="File containing prompt to use instead of the default",
)
parser_dump.add_argument(
"--pattern",
type=str,
default="10,1000,10,4000,10",
help="Pattern n_get,n_skip,... where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)",
)
# compare subcommand
parser_compare = subparsers.add_parser(
"compare", help="compare two dumped logits files"
)
parser_compare.add_argument("input1", type=Path, help="first input file (.log)")
parser_compare.add_argument("input2", type=Path, help="second input file (.log)")
parser_compare.add_argument(
"output", type=Path, help="output path for comparison report (.md)"
)
try:
return parser.parse_args()
except Exception as e:
parser.print_help()
raise e
def main():
args = parse_args()
if args.verb == "dump":
pattern = parse_pattern(args.pattern)
input_length = sum(n for _, n in pattern)
input_words = generate_input_prompt(input_length)
if args.file is not None:
with args.file.open("r") as f:
input_words = f.read().strip().split(" ")
if input_length < sum(n for _, n in pattern):
raise ValueError(
f"Input file has only {input_length} words, but pattern requires at least {input_length} words."
)
input_length = len(input_words)
logger.info(f"Using {input_length} words")
dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
elif args.verb == "compare":
compare_logits(args.input1, args.input2, args.output)
else:
raise ValueError(f"Unknown verb: {args.verb}")
if __name__ == "__main__":
main()
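A quick note on the pattern grammar used above: the comma-separated counts alternate between words to dump and words to skip. For example (hypothetical values), parse_pattern("10,1000,10") returns [(True, 10), (False, 1000), (True, 10)], i.e. dump logprobs for the first 10 words, skip the next 1000, then dump 10 more.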

View File

@@ -1 +1 @@
55bc9320a4aae82af18e23eefd5de319a755d7b9
130bc125a88bb57664b88932c48c38a1cb316fac

View File

@@ -9,6 +9,7 @@
#include "llama-model.h"
#include <cinttypes>
#include <cmath>
#include <cstring>
#include <limits>
#include <stdexcept>
@@ -72,6 +73,43 @@ llama_context::llama_context(
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
}
if (cparams.yarn_ext_factor != 0) {
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
const float factor = 1.0f / cparams.rope_freq_scale;
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
if (hparams.rope_yarn_log_mul != 0.0f) {
// note: here we assume `mscale == 1.0f`
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
float mscale = 1.0f;
const float mscale_all_dims = hparams.rope_yarn_log_mul;
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// special-case DEEPSEEK v2:
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
mscale = mscale_all_dims;
}
cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
__func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
} else {
cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
}
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
//
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
// https://github.com/ggml-org/llama.cpp/pull/17945
cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
}
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
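Numerically, the consolidated logic above reduces to the following (a minimal Python sketch of the same formulas; assumes mscale == 1.0 except for the DeepSeek-V2 special case, as the comments in the diff note):

import math

def get_mscale(scale: float, mscale: float = 1.0) -> float:
    return 1.0 if scale <= 1.0 else 0.1 * mscale * math.log(scale) + 1.0

def yarn_attn_factor(freq_scale: float, mscale_all_dims: float, deepseek2_fix: bool) -> float:
    factor = 1.0 / freq_scale
    mscale = mscale_all_dims if (deepseek2_fix and mscale_all_dims != 1.0) else 1.0
    f = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims)
    # cancel the correction the RoPE op applies itself when ext_factor != 0
    return f / (1.0 + 0.1 * math.log(factor))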
@@ -1318,6 +1356,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
// This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
synchronize();
buf_output = nullptr;
logits = nullptr;
embd = nullptr;

View File

@@ -574,7 +574,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
freq_base (cparams.rope_freq_base),
freq_scale (cparams.rope_freq_scale),
ext_factor (cparams.yarn_ext_factor),
attn_factor (llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor)),
attn_factor (cparams.yarn_attn_factor),
beta_fast (cparams.yarn_beta_fast),
beta_slow (cparams.yarn_beta_slow),
norm_eps (hparams.f_norm_eps),

View File

@@ -3,7 +3,6 @@
#include "ggml.h"
#include <cassert>
#include <cmath>
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
if (dense_first) {
@@ -231,13 +230,3 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama
return false;
}
float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
GGML_ASSERT(ext_factor >= 0.0f);
if (ext_factor != 0.0f) {
attn_factor *= 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
}
return attn_factor;
}

View File

@@ -268,13 +268,6 @@ struct llama_hparams {
// TODO: think of a better place for this function
// TODO: pack the SWA params in a struct?
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
// when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
// https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
//
// ref: https://github.com/ggml-org/llama.cpp/discussions/7416
// https://github.com/ggml-org/llama.cpp/pull/17945
static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

View File

@@ -1372,7 +1372,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & yarn_attn_factor = llama_hparams::yarn_attn_factor_adjust(cparams.yarn_attn_factor, cparams.rope_freq_scale, cparams.yarn_ext_factor);
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE

View File

@@ -2294,32 +2294,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: throw std::runtime_error("unsupported model architecture");
}
// ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
if (hparams.rope_yarn_log_mul != 0.0f) {
const float factor = 1.0f / hparams.rope_freq_scale_train;
// note: here we assume `mscale == 1.0f`
// TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
float mscale = 1.0f;
const float mscale_all_dims = hparams.rope_yarn_log_mul;
// [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
// special-case DEEPSEEK v2:
// https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
if (arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
mscale = mscale_all_dims;
}
static auto get_mscale = [](float scale, float mscale) {
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
};
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
__func__, hparams.yarn_attn_factor, mscale, mscale_all_dims);
}
pimpl->n_bytes = ml.n_bytes;
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();

View File

@@ -72,6 +72,10 @@ int main(void) {
argv = {"binary_name", "--draft", "123"};
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING));
// negated arg
argv = {"binary_name", "--no-mmap"};
assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
printf("test-arg-parser: test valid usage\n\n");

View File

@@ -7652,6 +7652,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F32, {1, 1}, 0.1f, 8.0f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {200001, 2, 3, 1}, true, true, GGML_TYPE_F16, {1, 1}, 0.1f, 8.0f));
for (float max_bias : {0.0f, 8.0f}) {
for (float scale : {1.0f, 0.1f}) {
for (int64_t ne0 : {16, 1024}) {

View File

@@ -16,6 +16,7 @@
#include <atomic>
#include <chrono>
#include <queue>
#include <filesystem>
#ifdef _WIN32
#include <winsock2.h>
@@ -171,7 +172,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
}
// read base args from router's argv
common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
// remove any router-controlled args from base_args
for (const auto & cargs : control_args) {

View File

@@ -11,8 +11,9 @@ endif()
target_link_libraries (${TARGET} PRIVATE Threads::Threads)
if (WIN32 AND NOT MSVC)
target_link_libraries(${TARGET} PUBLIC ws2_32)
target_link_libraries(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_compile_definitions(${TARGET} PRIVATE