Compare commits

...

3 Commits

Author SHA1 Message Date
Georgi Gerganov
7604a7d6b8 metal : fix floating-point range of attention scores in FA kernels (#13090)
ggml-ci
2025-04-24 10:38:30 +03:00
Eve
b3b6d862cf vulkan: matmul gcn tuning (#13016)
* tune matmul for gcn

* this one is more power efficient

* Update ggml/src/ggml-vulkan/ggml-vulkan.cpp

Co-authored-by: 0cc4m <picard12@live.de>

* disable this tune for the proprietary driver

---------

Co-authored-by: 0cc4m <picard12@live.de>
2025-04-24 09:18:33 +02:00
pl752
5630406959 llama-mtmd-cli: Sigint rework in mtmd vision example (#13080)
* Sigint rework in mtmd vision example

* Applied suggestions on mtmd-cli PR

* Forgot to invert one of the conditions

* Update examples/llava/mtmd-cli.cpp

* Removed redundant exit check

---------

Co-authored-by: pl752 <maximpl752@gmail.com>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2025-04-23 23:32:35 +02:00
3 changed files with 34 additions and 10 deletions

View File

@@ -24,7 +24,9 @@
#include <signal.h>
#endif
static bool g_is_generating = false;
// volatile, because these flags are written asynchronously from the SIGINT handler
static volatile bool g_is_generating = false;
static volatile bool g_is_interrupted = false;
/**
* Please note that this is NOT production-ready code.
@@ -50,8 +52,10 @@ static void sigint_handler(int signo) {
g_is_generating = false;
} else {
console::cleanup();
LOG("\nInterrupted by user\n");
_exit(130);
if (g_is_interrupted) {
_exit(1);
}
g_is_interrupted = true;
}
}
}
@@ -167,7 +171,7 @@ struct decode_embd_batch {
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
llama_tokens generated_tokens;
for (int i = 0; i < n_predict; i++) {
if (i > n_predict || !g_is_generating) {
if (i > n_predict || !g_is_generating || g_is_interrupted) {
printf("\n");
break;
}
@@ -184,6 +188,11 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str());
fflush(stdout);
if (g_is_interrupted) {
printf("\n");
break;
}
// eval the token
common_batch_clear(ctx.batch);
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
@@ -219,6 +228,9 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
text.add_special = add_bos;
text.parse_special = true;
mtmd_input_chunks chunks;
if (g_is_interrupted) return 0;
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps);
if (res != 0) {
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
@@ -276,6 +288,8 @@ int main(int argc, char ** argv) {
#endif
}
if (g_is_interrupted) return 130;
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find("<__image__>") == std::string::npos) {
@@ -287,7 +301,7 @@ int main(int argc, char ** argv) {
if (eval_message(ctx, msg, params.image, true)) {
return 1;
}
if (generate_response(ctx, smpl, n_predict)) {
if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
return 1;
}
@@ -302,12 +316,13 @@ int main(int argc, char ** argv) {
std::vector<std::string> images_fname;
std::string content;
while (true) {
while (!g_is_interrupted) {
g_is_generating = false;
LOG("\n> ");
console::set_display(console::user_input);
std::string line;
console::readline(line, false);
if (g_is_interrupted) break;
console::set_display(console::reset);
line = string_strip(line);
if (line.empty()) {
@@ -335,6 +350,7 @@ int main(int argc, char ** argv) {
msg.role = "user";
msg.content = content;
int ret = eval_message(ctx, msg, images_fname, is_first_msg);
if (g_is_interrupted) break;
if (ret == 2) {
// non-fatal error
images_fname.clear();
@@ -352,6 +368,7 @@ int main(int argc, char ** argv) {
is_first_msg = false;
}
}
if (g_is_interrupted) LOG("\nInterrupted by user\n");
llama_perf_context_print(ctx.lctx);
return 0;
return g_is_interrupted ? 130 : 0;
}

View File

@@ -3192,7 +3192,7 @@ kernel void kernel_flash_attn_ext(
{
float S[Q] = { [0 ... Q-1] = 0.0f };
float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 };
float M[Q] = { [0 ... Q-1] = -__FLT_MAX__/2 };
// thread indices inside the simdgroup
// TODO: see if we can utilize quad-group functions for better performance
@@ -3452,7 +3452,7 @@ kernel void kernel_flash_attn_ext(
// reduce the warps sequentially
for (ushort sg = 1; sg < nsg; ++sg) {
float S = { 0.0f };
float M = { -__FLT16_MAX__/2 };
float M = { -__FLT_MAX__/2 };
threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -3699,7 +3699,7 @@ kernel void kernel_flash_attn_ext_vec(
{
float S = 0.0f;
float M = -__FLT16_MAX__/2;
float M = -__FLT_MAX__/2;
// thread indices inside the simdgroup
const short tx = tiisg%NL;

View File

@@ -246,6 +246,7 @@ struct vk_device_struct {
bool pipeline_robustness;
vk::Device device;
uint32_t vendor_id;
vk::DriverId driver_id;
vk_device_architecture architecture;
vk_queue compute_queue;
vk_queue transfer_queue;
@@ -1740,6 +1741,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 };
s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 };
// chip specific tuning
if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) {
m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 };
}
l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 };
@@ -2658,6 +2664,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->physical_device.getProperties2(&props2);
device->properties = props2.properties;
device->vendor_id = device->properties.vendorID;
device->driver_id = driver_props.driverID;
const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");