Compare commits

..

2 Commits

Author SHA1 Message Date
forforever73
09343c0198 model : support step3-vl-10b (#21287)
* feat: support step3-vl-10b

* use fused QKV && mapping tensor in tensor_mapping.py

* guard hardcoded params and drop crop metadata

* get understand_projector_stride from global config

* img_u8_resize_bilinear_to_f32 move in step3vl class

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix the \r\n mess

* add width and heads to MmprojModel.set_gguf_parameters

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-08 09:51:31 +02:00
Hamish M. Blair
97508acb17 webui: fix syntax highlighting lost after streaming for non-common languages (#21206)
* webui: fix syntax highlighting lost for non-common languages after streaming

rehype-highlight uses lowlight internally, which only bundles 37 "common"
languages. The streaming code path uses highlight.js directly (192 languages),
so languages like Haskell highlight correctly while streaming but lose all
color once the code block closes. Pass the full lowlight language set to
rehype-highlight so both paths support the same languages.

* webui: rebuild static files after rebase
2026-04-08 08:58:08 +02:00
18 changed files with 704 additions and 827 deletions

View File

@@ -2219,10 +2219,10 @@ class MmprojModel(ModelBase):
self.image_size = self.find_vparam(["image_size"])
self.gguf_writer.add_vision_image_size(self.image_size)
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"]))
# preprocessor config
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
@@ -4949,6 +4949,73 @@ class Glm4VVisionModel(Qwen3VLVisionModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLVisionModel(MmprojModel):
    """Converts the vision tower and projector tensors of a Step3-VL checkpoint."""

    def __init__(self, *args, **kwargs):
        """Fill in vision hyper-parameters and preprocessor defaults the config may omit."""
        super().__init__(*args, **kwargs)
        vision_cfg = self.hparams_vision
        assert vision_cfg is not None

        # Derive the MLP width from hidden size and ratio when it is missing or falsy.
        if not vision_cfg.get("intermediate_size"):
            hidden_size = vision_cfg.get("hidden_size") or vision_cfg.get("width") or 0
            assert hidden_size > 0
            mlp_ratio = float(vision_cfg.get("mlp_ratio", 8960 / 1536))
            vision_cfg["intermediate_size"] = int(round(hidden_size * mlp_ratio))

        # Only fills the normalisation constants when the preprocessor config lacks them.
        for key, fallback in (
            ("image_mean", _MISTRAL_COMMON_DATASET_MEAN),
            ("image_std", _MISTRAL_COMMON_DATASET_STD),
        ):
            self.preprocessor_config.setdefault(key, list(fallback))

    def set_gguf_parameters(self):
        """Write the Step3-VL specific vision metadata after the base-class parameters."""
        super().set_gguf_parameters()
        vision_cfg = self.hparams_vision
        assert vision_cfg is not None

        projector_stride = int(self.global_config.get("understand_projector_stride", -1))
        hidden_size = int(vision_cfg.get("hidden_size", vision_cfg.get("width", -1)))
        num_layers = int(vision_cfg.get("num_hidden_layers", vision_cfg.get("layers", -1)))

        # Refuse configurations other than the single one this path is checked against.
        validated_shape = (2, 728, 1536, 47)
        assert (projector_stride, int(vision_cfg.get("image_size", -1)), hidden_size, num_layers) == validated_shape, (
            "current Step3-VL conversion path is only validated for Step3-VL-10B"
        )

        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
        writer.add_vision_attention_layernorm_eps(float(vision_cfg.get("layer_norm_eps", 1e-5)))
        writer.add_vision_projector_scale_factor(projector_stride ** 2)
        # 3024 max resize comes from step3-vl-10b processing_step3.py.
        writer.add_vision_preproc_image_size(3024)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Force position-embedding tensors to F32; defer to the base class otherwise."""
        if ".position_embd." not in new_name:
            return super().tensor_force_quant(name, new_name, bid, n_dims)
        return gguf.GGMLQuantizationType.F32

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield (gguf_name, tensor) pairs for vision tensors, dropping text-model tensors.

        Raises:
            ValueError: if a ``vision_model.vit_downsampler*`` tensor name does not
                have the expected ``<index>.(weight|bias)`` form.
        """
        # Tensors of the language model are not part of the mmproj file.
        if name.startswith(("model.", "lm_head.")):
            return

        if name.startswith("vision_model.vit_downsampler"):
            parsed = re.match(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
            if parsed is None:
                raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")
            index_text, kind = parsed.group(1), parsed.group(2)
            # Checkpoint indices are shifted down by one for the gguf projector index.
            target = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, int(index_text) - 1, suffix=f".{kind}")
            yield (target, data_torch)
            return

        if name == "vit_large_projector.weight":
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
            return

        if name.startswith("vision_model."):
            # Rewrite checkpoint names into the "<module>.weight" / "<module>.bias" form.
            if name == "vision_model.positional_embedding":
                name = name + ".weight"
            elif name.endswith(".gamma") and ".ls_" in name:
                name = name.removesuffix(".gamma") + ".weight"
            for fused, dotted in (
                ("attn.in_proj_weight", "attn.in_proj.weight"),
                ("attn.in_proj_bias", "attn.in_proj.bias"),
            ):
                name = name.replace(fused, dotted)

        yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
model_arch = gguf.MODEL_ARCH.QWEN3VL
@@ -4969,6 +5036,16 @@ class Qwen3VLTextModel(Qwen3Model):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLTextModel(Qwen3Model):
    """Text-only converter for Step3-VL checkpoints, emitted with the QWEN3 architecture."""

    model_arch = gguf.MODEL_ARCH.QWEN3

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Forward text tensors to the base class and drop vision/projector tensors."""
        vision_prefixes = ("vision_model.", "model.vision_model.", "vit_large_projector.")
        if not name.startswith(vision_prefixes):
            yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
@@ -12994,6 +13071,12 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# For non-hf Mamba and Mamba2 models
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
# For text conversion we route to a dedicated text-only class.
# TODO: refactor this later to avoid adding exception here
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
return arch
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]

View File

@@ -223,7 +223,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_VULKAN_COPY_TESTS "ggml: run Vulkan cross-device copy benchmarks" OFF)
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)

View File

@@ -120,10 +120,6 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
endif()
if (GGML_VULKAN_COPY_TESTS)
add_compile_definitions(GGML_VULKAN_COPY_TESTS)
endif()
# Set up toolchain for host compilation whether cross-compiling or not
if (CMAKE_CROSSCOMPILING)
if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)

View File

@@ -1,12 +1,9 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_COPY_TESTS)
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#include <chrono>
#include "ggml-cpu.h"
#endif
#if defined(GGML_VULKAN_COPY_TESTS) && !defined(_WIN32)
#include <unistd.h>
#endif
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
@@ -594,7 +591,6 @@ struct vk_device_struct {
uint64_t suballocation_block_size;
uint64_t min_imported_host_pointer_alignment;
bool external_memory_host {};
bool external_semaphore_fd {};
bool fp16;
bool bf16;
bool pipeline_robustness;
@@ -1663,7 +1659,6 @@ struct ggml_vk_garbage_collector {
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
static void ggml_vk_load_shaders(vk_device& device);
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
static bool vk_memory_logger_enabled = false;
@@ -4887,8 +4882,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
device->memory_priority = true;
} else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
device->external_memory_host = true;
} else if (strcmp("VK_KHR_external_semaphore_fd", properties.extensionName) == 0) {
device->external_semaphore_fd = true;
#if defined(VK_EXT_shader_64bit_indexing)
} else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) {
device->shader_64b_indexing = true;
@@ -5188,10 +5181,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_external_memory_host");
}
if (device->external_semaphore_fd) {
device_extensions.push_back("VK_KHR_external_semaphore_fd");
}
#if defined(VK_EXT_shader_64bit_indexing)
VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {};
shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT;
@@ -12641,654 +12630,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
}
#endif
#ifdef GGML_VULKAN_COPY_TESTS
// Cross-device copy benchmark
// Tests different approaches to copying data between two Vulkan devices.
// Configure with -DGGML_VULKAN_COPY_TESTS=ON and run any llama.cpp command on a system with >= 2 Vulkan devices.
// Helper: allocate shared staging buffer importable by both devices
// One aligned host allocation imported as a buffer into two Vulkan devices.
// alloc() either succeeds completely or leaves the object empty; free_resources()
// may be called any number of times.
struct vk_shared_staging {
    void * host_ptr = nullptr;  // aligned host allocation backing both buffers
    vk_buffer buf_dev0;         // host_ptr imported into the first device
    vk_buffer buf_dev1;         // host_ptr imported into the second device
    size_t size = 0;            // allocation size, rounded up to the import alignment

    // Allocates at least `sz` bytes and imports them into dev0 and dev1.
    // Returns true only when the allocation and both imports succeeded.
    bool alloc(vk_device & dev0, vk_device & dev1, size_t sz) {
        // Release anything a previous alloc() left behind so repeated calls do not leak.
        free_resources();

        size_t align = std::max(dev0->min_imported_host_pointer_alignment,
                                dev1->min_imported_host_pointer_alignment);
        // The rounding mask below is only correct for a non-zero power of two.
        if (align == 0 || (align & (align - 1)) != 0) {
            return false;
        }
        size = (sz + align - 1) & ~(align - 1);
        if (size == 0) {
            return false;
        }
#ifdef _WIN32
        host_ptr = _aligned_malloc(size, align);
#else
        if (posix_memalign(&host_ptr, align, size) != 0) { host_ptr = nullptr; }
#endif
        if (!host_ptr) {
            size = 0;
            return false;
        }
        buf_dev0 = ggml_vk_buffer_from_host_ptr(dev0, host_ptr, size);
        buf_dev1 = ggml_vk_buffer_from_host_ptr(dev1, host_ptr, size);
        if (!(buf_dev0 && buf_dev1)) {
            // Do not leave a half-imported allocation behind on failure.
            free_resources();
            return false;
        }
        return true;
    }

    // Destroys both imported buffers, then frees the host allocation.
    void free_resources() {
        // Buffers go first: they refer to host_ptr.
        ggml_vk_destroy_buffer(buf_dev0);
        ggml_vk_destroy_buffer(buf_dev1);
#ifdef _WIN32
        _aligned_free(host_ptr);
#else
        free(host_ptr);
#endif
        host_ptr = nullptr;
        size = 0;
    }
};
// Helper: run a benchmark and print results
// Helper: print the median time and the derived bandwidth of one benchmark to stderr.
//   name  - label printed in the first column
//   times - per-iteration timings in milliseconds; sorted in place
//   size  - number of bytes moved per iteration
// With no samples a placeholder line is printed instead of indexing an empty vector.
static void vk_bench_print(const char * name, std::vector<double> & times, size_t size) {
    if (times.empty()) {
        std::cerr << " " << std::left << std::setw(22) << name << " : no samples" << std::endl;
        return;
    }
    std::sort(times.begin(), times.end());
    double median = times[times.size() / 2];
    // GiB moved divided by seconds elapsed.
    double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0);
    std::cerr << " " << std::left << std::setw(22) << name << " : "
              << std::fixed << std::setprecision(3) << median << " ms "
              << std::setprecision(2) << bw << " GB/s" << std::endl;
}
// Results stored per (method, size) for table output
// One benchmark measurement for a single (method, size) combination.
struct vk_copy_result {
// name of the copy strategy that produced this measurement
std::string method;
// median iteration time in milliseconds; a negative value marks a skipped size
double ms;
// throughput derived from ms (GiB moved per second); negative when skipped
double gbps;
};
// Benchmarks several strategies for copying a device-local buffer from dev0 to dev1.
// For every size in test_sizes and every strategy that can run, one vk_copy_result
// (median milliseconds and derived GB/s) is appended to results[method].
// Each timed loop runs warmup + num_it iterations and discards the first `warmup`.
// test_sizes must be non-empty: its last element is used as the size of all
// benchmark buffers, so it is presumably expected to be the largest entry.
static void ggml_vk_bench_pair(
vk_device & dev0, vk_device & dev1,
const std::vector<size_t> & test_sizes,
std::map<std::string, std::vector<vk_copy_result>> & results) {
const size_t num_it = 20;
const size_t warmup = 3;
const size_t max_size = test_sizes.back();
// Allocate buffers
vk_buffer buf_src = ggml_vk_create_buffer_check(dev0, max_size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
vk_buffer buf_dst = ggml_vk_create_buffer_check(dev1, max_size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
vk_buffer staging_src = ggml_vk_create_buffer_check(dev0, max_size,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
vk_buffer staging_dst = ggml_vk_create_buffer_check(dev1, max_size,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
// Fill source
{
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
subctx->s->buffer->buf.fillBuffer(buf_src->buffer, 0, max_size, 0xDEADBEEF);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "fill");
dev0->device.resetFences({ dev0->fence });
}
// Capability checks: shared staging needs host-pointer import on both devices;
// the sync_fd paths additionally need dev0 to export and dev1 to import SYNC_FD semaphores.
bool has_shared_staging = dev0->external_memory_host && dev1->external_memory_host;
bool has_syncfd = false;
#ifndef _WIN32
if (dev0->external_semaphore_fd && dev1->external_semaphore_fd) {
vk::PhysicalDeviceExternalSemaphoreInfo query{};
query.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
auto p0 = dev0->physical_device.getExternalSemaphoreProperties(query);
auto p1 = dev1->physical_device.getExternalSemaphoreProperties(query);
has_syncfd =
(p0.externalSemaphoreFeatures & vk::ExternalSemaphoreFeatureFlagBits::eExportable) &&
(p0.compatibleHandleTypes & vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd) &&
(p1.externalSemaphoreFeatures & vk::ExternalSemaphoreFeatureFlagBits::eImportable) &&
(p1.compatibleHandleTypes & vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd);
}
#endif
// Helper to record a result
// Sorts `times` in place and stores the median plus the bandwidth derived from it.
// NOTE(review): `times` must be non-empty here; every caller below pushes num_it samples first.
auto record = [&](const std::string & method, size_t size, std::vector<double> & times) {
std::sort(times.begin(), times.end());
double median = times[times.size() / 2];
double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0);
results[method].push_back({ method, median, bw });
};
// Helper to record a skipped size (sentinel: negative ms)
auto skip = [&](const std::string & method) {
results[method].push_back({ method, -1.0, -1.0 });
};
for (size_t size : test_sizes) {
// =================================================================
// 1. Baseline: current sync double-hop (separate staging buffers + memcpy)
// =================================================================
{
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "baseline hop1");
dev0->device.resetFences({ dev0->fence });
}
// Host-side copy between the two per-device staging buffers.
memcpy(staging_dst->ptr, staging_src->ptr, size);
{
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "baseline hop2");
dev1->device.resetFences({ dev1->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("baseline", size, times);
}
// =================================================================
// 2. Diagnostics: individual hop timings
// =================================================================
// hop1_only: device 0 -> its own staging buffer.
{
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "diag hop1");
dev0->device.resetFences({ dev0->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("hop1_only", size, times);
}
// hop2_only: device 1 staging buffer -> device 1.
{
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "diag hop2");
dev1->device.resetFences({ dev1->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("hop2_only", size, times);
}
// =================================================================
// 3. Shared staging: single host buffer imported into both devices
// =================================================================
if (has_shared_staging) {
vk_shared_staging stg;
if (stg.alloc(dev0, dev1, size)) {
std::vector<double> times;
for (size_t i = 0; i < num_it + warmup; i++) {
auto begin = std::chrono::high_resolution_clock::now();
{
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, 0, buf_src, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev0->fence);
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "shared hop1");
dev0->device.resetFences({ dev0->fence });
}
{
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, stg.buf_dev1, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "shared hop2");
dev1->device.resetFences({ dev1->fence });
}
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
record("shared_staging", size, times);
} else {
// NOTE(review): no sentinel is recorded on this path, so this method ends up
// with fewer entries than test_sizes for this device pair.
std::cerr << " shared_staging : SKIPPED (import failed)" << std::endl;
}
// Runs on both the success and the failure path of alloc().
stg.free_resources();
}
// =================================================================
// 4. Chunked pipeline: split into N chunks, overlap hop1/hop2
// via full-duplex PCIe. Vary chunk count to find optimum.
// =================================================================
if (has_shared_staging) {
for (size_t n_chunks : { 2, 4, 8 }) {
char cname[32];
snprintf(cname, sizeof(cname), "chunked_%zu", n_chunks);
// Sizes below 4096 bytes per chunk are recorded as skipped.
if (size < n_chunks * 4096) { skip(cname); continue; }
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
dev1->min_imported_host_pointer_alignment);
// chunk_data is the payload per chunk; chunk_aligned is the staging stride per chunk.
size_t chunk_data = size / n_chunks;
size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1);
vk_shared_staging stg;
if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) {
std::cerr << " chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl;
stg.free_resources();
continue;
}
// Per-chunk timeline semaphores
std::vector<vk::Semaphore> chunk_sems(n_chunks);
std::vector<uint64_t> sem_vals(n_chunks, 0);
for (size_t c = 0; c < n_chunks; c++) {
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
vk::SemaphoreCreateInfo sci{};
sci.setPNext(&tci);
chunk_sems[c] = dev0->device.createSemaphore(sci);
}
std::vector<double> times;
for (size_t iter = 0; iter < num_it + warmup; iter++) {
auto begin = std::chrono::high_resolution_clock::now();
// Submit all hop1s upfront
for (size_t c = 0; c < n_chunks; c++) {
size_t off_src = c * chunk_data;
size_t off_stg = c * chunk_aligned;
// The last chunk also carries the remainder of size / n_chunks.
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, off_stg, buf_src, off_src, csz);
// Each iteration signals the next value of this chunk's timeline semaphore.
sem_vals[c]++;
subctx->s->signal_semaphores.push_back({ chunk_sems[c], sem_vals[c] });
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
}
// Per-chunk: CPU wait hop1, submit hop2
for (size_t c = 0; c < n_chunks; c++) {
size_t off_dst = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, chunk_sems[c], sem_vals[c]};
VK_CHECK(dev0->device.waitSemaphores(swi, UINT64_MAX), "chunked sem wait");
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
ggml_vk_buffer_copy_async(subctx, buf_dst, off_dst, stg.buf_dev1, off_stg, csz);
ggml_vk_ctx_end(subctx);
// Only the last chunk's submission carries the fence waited on below.
ggml_vk_submit(subctx, (c == n_chunks - 1) ? dev1->fence : vk::Fence{});
}
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "chunked final");
dev1->device.resetFences({ dev1->fence });
auto end = std::chrono::high_resolution_clock::now();
if (iter >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
char name[32];
snprintf(name, sizeof(name), "chunked_%zu", n_chunks);
record(name, size, times);
for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(chunk_sems[c]);
stg.free_resources();
}
}
// =================================================================
// 5. sync_fd async: fully GPU-synchronised via Linux sync_file
// =================================================================
#ifndef _WIN32
if (has_shared_staging && has_syncfd) {
vk_shared_staging stg;
if (stg.alloc(dev0, dev1, size)) {
std::vector<double> times;
bool run_ok = true;
for (size_t i = 0; i < num_it + warmup && run_ok; i++) {
auto begin = std::chrono::high_resolution_clock::now();
// A fresh exportable semaphore is created inside the timed region on every iteration.
vk::ExportSemaphoreCreateInfo esci{};
esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
vk::SemaphoreCreateInfo sci{};
sci.setPNext(&esci);
vk::Semaphore sem_dev0 = dev0->device.createSemaphore(sci);
// Hop 1 + signal
{
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, 0, buf_src, 0, size);
subctx->s->signal_semaphores.push_back({ sem_dev0, 0 });
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
}
// Export + import sync_fd
int sync_fd = -1;
try {
vk::SemaphoreGetFdInfoKHR gi{};
gi.semaphore = sem_dev0;
gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
sync_fd = dev0->device.getSemaphoreFdKHR(gi);
} catch (vk::SystemError& e) {
std::cerr << " syncfd_async : SKIPPED (export: " << e.what() << ")" << std::endl;
// NOTE(review): sem_dev0 is destroyed here without waiting for the hop 1
// submission that signals it — confirm this is valid.
dev0->device.destroySemaphore(sem_dev0);
run_ok = false; break;
}
vk::Semaphore sem_dev1 = dev1->device.createSemaphore({});
try {
vk::ImportSemaphoreFdInfoKHR ii{};
ii.semaphore = sem_dev1;
ii.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
ii.flags = vk::SemaphoreImportFlagBits::eTemporary;
ii.fd = sync_fd;
dev1->device.importSemaphoreFdKHR(ii);
} catch (vk::SystemError& e) {
std::cerr << " syncfd_async : SKIPPED (import: " << e.what() << ")" << std::endl;
dev0->device.destroySemaphore(sem_dev0);
dev1->device.destroySemaphore(sem_dev1);
// The fd is closed by hand only on the failed-import path.
close(sync_fd);
run_ok = false; break;
}
// Hop 2 with GPU-side wait
{
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
subctx->s->wait_semaphores.push_back({ sem_dev1, 0 });
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, stg.buf_dev1, 0, size);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, dev1->fence);
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "syncfd final");
dev1->device.resetFences({ dev1->fence });
}
dev0->device.destroySemaphore(sem_dev0);
dev1->device.destroySemaphore(sem_dev1);
auto end = std::chrono::high_resolution_clock::now();
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
if (run_ok) record("syncfd_async", size, times);
} else {
std::cerr << " syncfd_async : SKIPPED (import failed)" << std::endl;
}
stg.free_resources();
}
// =================================================================
// 6. sync_fd chunked: chunked pipeline with GPU-side sync_fd
// between hops (no CPU waits between chunks)
// =================================================================
if (has_shared_staging && has_syncfd) {
for (size_t n_chunks : { 2, 4, 8 }) {
char scname[48];
snprintf(scname, sizeof(scname), "syncfd_chunked_%zu", n_chunks);
if (size < n_chunks * 4096) { skip(scname); continue; }
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
dev1->min_imported_host_pointer_alignment);
size_t chunk_data = size / n_chunks;
size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1);
vk_shared_staging stg;
if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) {
std::cerr << " syncfd_chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl;
stg.free_resources();
continue;
}
std::vector<double> times;
bool run_ok = true;
for (size_t iter = 0; iter < num_it + warmup && run_ok; iter++) {
auto begin = std::chrono::high_resolution_clock::now();
// Create per-chunk exportable semaphores
std::vector<vk::Semaphore> sems_dev0(n_chunks);
for (size_t c = 0; c < n_chunks; c++) {
vk::ExportSemaphoreCreateInfo esci{};
esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
vk::SemaphoreCreateInfo sci{};
sci.setPNext(&esci);
sems_dev0[c] = dev0->device.createSemaphore(sci);
}
// Submit all hop1s with per-chunk signal
for (size_t c = 0; c < n_chunks; c++) {
size_t off_src = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev0, subctx);
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, off_stg, buf_src, off_src, csz);
subctx->s->signal_semaphores.push_back({ sems_dev0[c], 0 });
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, {});
}
// Export all sync_fds and import on dev1, submit hop2s
for (size_t c = 0; c < n_chunks && run_ok; c++) {
size_t off_dst = c * chunk_data;
size_t off_stg = c * chunk_aligned;
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
int sync_fd = -1;
try {
vk::SemaphoreGetFdInfoKHR gi{};
gi.semaphore = sems_dev0[c];
gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
sync_fd = dev0->device.getSemaphoreFdKHR(gi);
} catch (vk::SystemError& e) {
char nm[48]; snprintf(nm, sizeof(nm), "syncfd_chunked_%zu", n_chunks);
std::cerr << " " << nm << " : SKIPPED (export: " << e.what() << ")" << std::endl;
run_ok = false; break;
}
vk::Semaphore sem_dev1 = dev1->device.createSemaphore({});
try {
vk::ImportSemaphoreFdInfoKHR ii{};
ii.semaphore = sem_dev1;
ii.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
ii.flags = vk::SemaphoreImportFlagBits::eTemporary;
ii.fd = sync_fd;
dev1->device.importSemaphoreFdKHR(ii);
} catch (vk::SystemError& e) {
char nm[48]; snprintf(nm, sizeof(nm), "syncfd_chunked_%zu", n_chunks);
std::cerr << " " << nm << " : SKIPPED (import: " << e.what() << ")" << std::endl;
dev1->device.destroySemaphore(sem_dev1);
close(sync_fd);
run_ok = false; break;
}
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
ggml_vk_ctx_begin(dev1, subctx);
subctx->s->wait_semaphores.push_back({ sem_dev1, 0 });
ggml_vk_buffer_copy_async(subctx, buf_dst, off_dst, stg.buf_dev1, off_stg, csz);
ggml_vk_ctx_end(subctx);
ggml_vk_submit(subctx, (c == n_chunks - 1) ? dev1->fence : vk::Fence{});
// NOTE(review): sem_dev1 is destroyed right after the submission that waits on it,
// before the fence wait below — confirm destroying it while that work may still
// be pending is valid.
dev1->device.destroySemaphore(sem_dev1);
}
// NOTE(review): when run_ok is false nothing waits for the hop 1 / hop 2 work
// already submitted before the semaphores and staging memory are released.
if (run_ok) {
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "syncfd_chunked final");
dev1->device.resetFences({ dev1->fence });
}
for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(sems_dev0[c]);
auto end = std::chrono::high_resolution_clock::now();
if (run_ok && iter >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
}
if (run_ok) {
char name[48];
snprintf(name, sizeof(name), "syncfd_chunked_%zu", n_chunks);
record(name, size, times);
}
stg.free_resources();
}
}
#endif
}
// Release the buffers shared by all benchmark variants.
ggml_vk_destroy_buffer(buf_src);
ggml_vk_destroy_buffer(buf_dst);
ggml_vk_destroy_buffer(staging_src);
ggml_vk_destroy_buffer(staging_dst);
}
// Entry point of the cross-device copy benchmark.
// Runs ggml_vk_bench_pair for every ordered pair of distinct Vulkan devices, prints
// one markdown table per copy method to stderr, then terminates via GGML_ABORT.
// Returns early (without aborting) when fewer than two devices are available.
// `ctx` is not used.
static void ggml_vk_test_cross_device_copy(ggml_backend_vk_context * ctx) {
ggml_vk_instance_init();
const size_t n_devices = vk_instance.device_indices.size();
if (n_devices < 2) {
std::cerr << "COPY TEST: Need at least 2 Vulkan devices, found " << n_devices << std::endl;
return;
}
// List devices
std::cerr << "\n=== Vulkan Devices ===" << std::endl;
std::vector<vk_device> devices(n_devices);
for (size_t i = 0; i < n_devices; i++) {
devices[i] = ggml_vk_get_device(i);
std::cerr << " [" << i << "] " << devices[i]->name << std::endl;
}
// Transfer sizes in ascending order; the benchmark sizes its buffers from the last entry.
const std::vector<size_t> test_sizes = {
4096, // 4 KB
256 * 1024, // 256 KB
1 * 1024 * 1024, // 1 MB
16 * 1024 * 1024, // 16 MB
64 * 1024 * 1024, // 64 MB
256 * 1024 * 1024, // 256 MB
};
// Collect results: results[pair_label][method_name] = vector of vk_copy_result (one per size)
struct pair_results {
std::string label;
std::map<std::string, std::vector<vk_copy_result>> methods;
};
std::vector<pair_results> all_results;
// Run benchmarks for all ordered pairs
for (size_t i = 0; i < n_devices; i++) {
for (size_t j = 0; j < n_devices; j++) {
if (i == j) continue;
std::string label = devices[i]->name + " -> " + devices[j]->name;
std::cerr << "\n\n=== " << label << " ===" << std::endl;
pair_results pr;
pr.label = label;
ggml_vk_bench_pair(devices[i], devices[j], test_sizes, pr.methods);
all_results.push_back(std::move(pr));
}
}
// Output markdown tables: one table per method
// Collect all method names
std::vector<std::string> method_order;
if (!all_results.empty()) {
// Use first pair's method order as canonical
// (methods is a std::map, so this order is the map's key order)
for (auto & [method, _] : all_results[0].methods) {
method_order.push_back(method);
}
// Add any methods from other pairs not in the first
for (auto & pr : all_results) {
for (auto & [method, _] : pr.methods) {
if (std::find(method_order.begin(), method_order.end(), method) == method_order.end()) {
method_order.push_back(method);
}
}
}
}
std::cerr << "\n\n# Cross-Device Copy Benchmark Results\n" << std::endl;
for (auto & method : method_order) {
std::cerr << "## " << method << "\n" << std::endl;
// Header: | Direction | 4KB | 256KB | ... |
std::cerr << "| Direction |";
for (size_t s : test_sizes) {
if (s < 1024 * 1024) {
std::cerr << " " << s / 1024 << " KB |";
} else {
std::cerr << " " << s / (1024 * 1024) << " MB |";
}
}
std::cerr << std::endl;
// Separator
std::cerr << "|---|";
for (size_t s = 0; s < test_sizes.size(); s++) {
std::cerr << "---|";
GGML_UNUSED(s);
}
std::cerr << std::endl;
// Data rows
for (auto & pr : all_results) {
std::cerr << "| " << pr.label << " |";
auto it = pr.methods.find(method);
// A row is printed with values only when the method has exactly one entry per size;
// otherwise every cell of the row is a dash.
if (it != pr.methods.end() && it->second.size() == test_sizes.size()) {
for (auto & r : it->second) {
// Negative ms is the sentinel for a skipped size.
if (r.ms < 0) {
std::cerr << " - |";
} else {
std::cerr << " " << std::fixed << std::setprecision(1) << r.ms << " ms (" << std::setprecision(1) << r.gbps << " GB/s) |";
}
}
} else {
for (size_t s = 0; s < test_sizes.size(); s++) {
std::cerr << " - |";
GGML_UNUSED(s);
}
}
std::cerr << std::endl;
}
std::cerr << std::endl;
}
// Ends the process once the tables are printed, so the command that triggered the test does not continue.
GGML_ABORT("GGML_VULKAN_COPY_TESTS completed");
GGML_UNUSED(ctx);
}
#endif
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) {
#if defined(GGML_VULKAN_COPY_TESTS)
ggml_vk_test_cross_device_copy(ctx);
#endif
#if defined(GGML_VULKAN_RUN_TESTS)
const std::vector<size_t> vals {
512, 512, 128,

View File

@@ -506,6 +506,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
GEMMA3N = auto()
GEMMA3 = auto()
QWEN3VL = auto()
STEP3VL = auto()
COGVLM = auto()
@@ -987,6 +988,8 @@ VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger",
VISION_PROJECTOR_TYPE.STEP3VL: "step3vl",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -4105,6 +4108,7 @@ class VisionProjectorType:
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
QWEN3VL = "qwen3vl_merger"
STEP3VL = "step3vl"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio

View File

@@ -1406,6 +1406,7 @@ class TensorNameMap:
"siglip2.vision_model.embeddings.patch_embedding",
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
"model.vision_tower.patch_embedder.input_proj", # gemma4
"vision_model.conv1", # Step3-VL
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1425,6 +1426,7 @@ class TensorNameMap:
"visual.embeddings.position_embedding", # glm4v
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
"model.vision_tower.patch_embedder.position_embedding_table", # gemma4
"vision_model.positional_embedding", # Step3-VL
),
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
@@ -1443,6 +1445,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
"vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
"vision_model.transformer.resblocks.{bid}.attn.in_proj", # Step3-VL
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1523,6 +1526,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1543,6 +1547,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1562,6 +1567,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1582,6 +1588,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1609,6 +1616,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
),
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
@@ -1622,11 +1630,13 @@ class TensorNameMap:
MODEL_TENSOR.V_LAYER_SCALE_1: (
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
"model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
"vision_model.transformer.resblocks.{bid}.ls_1", # Step3-VL
),
MODEL_TENSOR.V_LAYER_SCALE_2: (
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
"vision_model.transformer.resblocks.{bid}.ls_2", # Step3-VL
),
MODEL_TENSOR.V_LAYER_OUT_SCALE: (
@@ -1639,6 +1649,7 @@ class TensorNameMap:
"vision_encoder.ln_pre", # pixtral
"vision_model.layernorm_pre", # llama4
"model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
"vision_model.ln_pre", # Step3-VL
),
MODEL_TENSOR.V_POST_NORM: (

View File

@@ -31,6 +31,7 @@ add_library(mtmd
models/pixtral.cpp
models/qwen2vl.cpp
models/qwen3vl.cpp
models/step3vl.cpp
models/siglip.cpp
models/whisper-enc.cpp
models/deepseekocr.cpp

View File

@@ -242,6 +242,7 @@ enum projector_type {
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_STEP3VL,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_GEMMA3NV,
PROJECTOR_TYPE_GEMMA3NA,
@@ -284,6 +285,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_STEP3VL, "step3vl"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},

View File

@@ -79,7 +79,6 @@ struct clip_hparams {
float eps = 1e-6;
float rope_theta = 0.0;
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;

View File

@@ -862,6 +862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
} break;
case PROJECTOR_TYPE_STEP3VL:
{
builder = std::make_unique<clip_graph_step3vl>(ctx, img);
} break;
case PROJECTOR_TYPE_MINICPMV:
{
builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
@@ -1337,6 +1341,17 @@ struct clip_model_loader {
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
}
} break;
case PROJECTOR_TYPE_STEP3VL:
{
hparams.n_merge = 4; // two stride-2 downsamplers after patching
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
hparams.rope_theta = 10000.0f;
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
if (hparams.image_longest_edge == 0) {
hparams.image_longest_edge = 3024;
}
hparams.warmup_image_size = hparams.image_size;
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
hparams.n_merge = 2;
@@ -1769,6 +1784,14 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_STEP3VL:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
} break;
case PROJECTOR_TYPE_YOUTUVL:
{
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
@@ -2615,6 +2638,8 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
return img->nx / (params.patch_size * params.n_merge);
default:
break;
}
@@ -2632,6 +2657,8 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
return img->ny / (params.patch_size * params.n_merge);
default:
break;
}
@@ -2702,6 +2729,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int y_patch = img->ny / (params.patch_size * 2);
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_STEP3VL:
{
int x_patch = img->nx / (params.patch_size * params.n_merge);
int y_patch = img->ny / (params.patch_size * params.n_merge);
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_IDEFICS3:
@@ -3004,6 +3037,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_STEP3VL:
{
std::vector<int32_t> pos_data(n_pos);
for (int i = 0; i < n_pos; i++) {
pos_data[i] = i / pos_w;
}
set_input_i32("pos_h", pos_data);
for (int i = 0; i < n_pos; i++) {
pos_data[i] = i % pos_w;
}
set_input_i32("pos_w", pos_data);
} break;
case PROJECTOR_TYPE_PADDLEOCR:
{
const int merge_ratio = hparams.n_merge;
@@ -3358,6 +3403,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN3VL:
// main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_STEP3VL:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];

View File

@@ -33,6 +33,11 @@ struct clip_graph_qwen3vl : clip_graph {
ggml_cgraph * build() override;
};
// Graph builder for the Step3-VL vision projector (selected for PROJECTOR_TYPE_STEP3VL).
struct clip_graph_step3vl : clip_graph {
    // Forwards the clip context and the input image to the base graph builder.
    clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    // Builds and returns the compute graph for one image.
    ggml_cgraph * build() override;
};
struct clip_graph_youtuvl : clip_graph {
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View File

@@ -0,0 +1,81 @@
#include "models.h"
// Build the Step3-VL vision graph:
//   ViT encoder (learned position embeddings + 2D RoPE)
//   -> two stride-2 convolutional downsamplers
//   -> linear projector.
ggml_cgraph * clip_graph_step3vl::build() {
    // This builder requires patch and position embeddings and no class embedding.
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    norm_type vit_norm = NORM_TYPE_NORMAL;

    // Creates a named I32 graph input with one entry per patch.
    auto new_pos_input = [&](const char * name) -> ggml_tensor * {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    // Per-patch position indices consumed by the 2D RoPE below.
    ggml_tensor * pos_h = new_pos_input("pos_h");
    ggml_tensor * pos_w = new_pos_input("pos_w");

    ggml_tensor * inp              = build_inp();
    ggml_tensor * learned_pos_embd = resize_position_embeddings();

    // Rotary position callback handed to the ViT builder.
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
    };

    // Adds an optional per-channel bias to a [w, h, c] feature map.
    // The map is flattened and transposed so that the channel axis comes first
    // for the addition, then restored to its original layout.
    auto add_spatial_bias = [&](ggml_tensor * fmap, ggml_tensor * bias) -> ggml_tensor * {
        if (bias == nullptr) {
            return fmap;
        }

        const int64_t w = fmap->ne[0];
        const int64_t h = fmap->ne[1];
        const int64_t c = fmap->ne[2];

        ggml_tensor * flat       = ggml_reshape_2d(ctx0, fmap, w * h, c);
        ggml_tensor * chan_first = ggml_cont(ctx0, ggml_transpose(ctx0, flat));
        ggml_tensor * biased     = ggml_add(ctx0, chan_first, bias);
        ggml_tensor * spatial    = ggml_cont(ctx0, ggml_transpose(ctx0, biased));

        return ggml_reshape_3d(ctx0, spatial, w, h, c);
    };

    ggml_tensor * cur = build_vit(
        inp,
        n_patches,
        vit_norm,
        hparams.ffn_op,
        learned_pos_embd,
        add_pos);
    cb(cur, "vit_out", -1);

    // [n_embd, n_patches] -> [w, h, n_embd] so the convolutions see a spatial map.
    ggml_tensor * swapped = ggml_permute(ctx0, cur, 1, 0, 2, 3);
    cur = ggml_cont_3d(ctx0, swapped, n_patches_x, n_patches_y, n_embd);

    // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1)
    cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, 2, 2, 1, 1, 1, 1);
    cur = add_spatial_bias(cur, model.mm_0_b);
    cb(cur, "downsample_0", -1);

    // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1)
    cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 2, 2, 1, 1, 1, 1);
    cur = add_spatial_bias(cur, model.mm_1_b);
    cb(cur, "downsample_1", -1);

    // [w, h, c] -> [c, w*h]: one column of channels per output token.
    const int64_t out_w = cur->ne[0];
    const int64_t out_h = cur->ne[1];
    ggml_tensor * tokens = ggml_reshape_3d(ctx0, cur, out_w * out_h, cur->ne[2], cur->ne[3]);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, tokens, 1, 0, 2, 3));
    cb(cur, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim)
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projector_out", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}

View File

@@ -1114,6 +1114,260 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
return true;
}
//
// mtmd_image_preprocessor_step3vl
//
// Resample a 3-channel interleaved u8 image to target_width x target_height with
// bilinear interpolation and store the normalized float result in dst.
// Each source sample is normalized as (v / 255 - mean[c]) / std[c] before blending.
void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
        const clip_image_u8 & src,
        clip_image_f32 & dst,
        int target_width,
        int target_height,
        const float mean[3],
        const float std[3]) {
    // Same size: no resampling required, only the u8 -> f32 conversion.
    if (src.nx == target_width && src.ny == target_height) {
        img_u8_to_f32(src, dst, mean, std);
        return;
    }

    dst.nx = target_width;
    dst.ny = target_height;
    dst.buf.resize(3 * target_width * target_height);

    const float step_x = static_cast<float>(src.nx) / target_width;
    const float step_y = static_cast<float>(src.ny) / target_height;

    // Clamps a source coordinate into [0, limit - 1].
    const auto clamp_coord = [](int v, int limit) {
        return std::max(0, std::min(v, limit - 1));
    };

    // Normalized value of channel c of the source pixel starting at idx.
    const auto normalized = [&](size_t idx, int c) {
        return (static_cast<float>(src.buf[idx + c]) / 255.0f - mean[c]) / std[c];
    };

    for (int dy = 0; dy < target_height; ++dy) {
        // Map the center of the destination row back into source coordinates.
        const float fy      = (static_cast<float>(dy) + 0.5f) * step_y - 0.5f;
        const int   fy_base = static_cast<int>(std::floor(fy));
        const int   row_top = clamp_coord(fy_base,     src.ny);
        const int   row_bot = clamp_coord(fy_base + 1, src.ny);
        const float wy      = fy - fy_base;

        for (int dx = 0; dx < target_width; ++dx) {
            const float fx      = (static_cast<float>(dx) + 0.5f) * step_x - 0.5f;
            const int   fx_base = static_cast<int>(std::floor(fx));
            const int   col_l   = clamp_coord(fx_base,     src.nx);
            const int   col_r   = clamp_coord(fx_base + 1, src.nx);
            const float wx      = fx - fx_base;

            // Offsets of the four neighbouring source pixels and of the output pixel.
            const size_t top_left  = 3 * (row_top * src.nx + col_l);
            const size_t top_right = 3 * (row_top * src.nx + col_r);
            const size_t bot_left  = 3 * (row_bot * src.nx + col_l);
            const size_t bot_right = 3 * (row_bot * src.nx + col_r);
            const size_t out       = 3 * (dy * target_width + dx);

            for (int c = 0; c < 3; ++c) {
                const float v00 = normalized(top_left,  c);
                const float v01 = normalized(top_right, c);
                const float v10 = normalized(bot_left,  c);
                const float v11 = normalized(bot_right, c);

                // Blend horizontally on both rows, then vertically.
                const float top = v00 + (v01 - v00) * wx;
                const float bot = v10 + (v11 - v10) * wx;
                dst.buf[out + c] = top + (bot - top) * wy;
            }
        }
    }
}
// Returns the maximum allowed longest edge: the value from the model
// hyper-parameters when it is positive, otherwise the built-in default.
int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
    if (params.image_longest_edge > 0) {
        return params.image_longest_edge;
    }
    return default_image_longest_edge;
}
// Picks the square crop window for an image with the given longer / shorter side.
// Returns 0 when the image should not be sliced at all.
int mtmd_image_preprocessor_step3vl::determine_window_size(const clip_hparams & params, int longer, int shorter) {
    const float ratio = static_cast<float>(longer) / shorter;

    // Image fits into the overview resolution: slice only if it is elongated,
    // using the shorter side as the window.
    if (longer <= params.image_size) {
        if (ratio > small_aspect_ratio_limit) {
            return shorter;
        }
        return 0;
    }

    // Larger image: default crop window, shrunk to the shorter side for very
    // wide / tall inputs.
    const int default_window = default_image_crop_size;
    if (ratio > wide_aspect_ratio_limit) {
        return std::min(shorter, default_window);
    }
    return default_window;
}
// Snaps `length` to a whole number of windows.
// A side shorter than one window is returned unchanged. Otherwise the window
// count is rounded up only when the fractional part of length / window_size
// exceeds crop_rounding_threshold, and rounded down in all other cases.
int mtmd_image_preprocessor_step3vl::calc_crop_extent(int length, int window_size) {
    const float n_windows_f = static_cast<float>(length) / window_size;
    if (n_windows_f < 1.0f) {
        return length;
    }

    const float whole     = std::floor(n_windows_f);
    int         n_windows = static_cast<int>(whole);
    if (n_windows_f - whole > crop_rounding_threshold) {
        n_windows += 1;
    }

    return window_size * n_windows;
}
// Returns the start offsets of the windows covering `length`.
// Windows are laid out back to back; when the last one would run past the end
// it is shifted back so that it ends exactly at `length`.
std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int window_size) {
    int n_windows = 1;
    if (length > window_size) {
        n_windows = static_cast<int>(std::ceil(static_cast<float>(length - window_size) / window_size + 1.0f));
    }

    std::vector<int> starts(n_windows);
    int offset = 0;
    for (int & start : starts) {
        start   = offset;
        offset += window_size;
    }

    const bool last_overflows = n_windows > 1 && starts.back() + window_size > length;
    if (last_overflows) {
        starts.back() = length - window_size;
    }

    return starts;
}
// Normalizes the input image before slicing:
//   1. tiny, extremely elongated images are pasted at the top-left of a black square;
//   2. images whose longest edge exceeds the configured limit are scaled down
//      (bilinear), keeping the aspect ratio.
clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
    clip_image_u8 result = img;

    // Width / height ratio; a zero-height image is treated as square.
    const float ratio = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;

    const bool is_tiny     = std::min(img.nx, img.ny) < 32;
    const bool is_extreme  = ratio > wide_aspect_ratio_limit || ratio < 1.0f / wide_aspect_ratio_limit;
    if (is_tiny && is_extreme) {
        const int side = std::max(img.nx, img.ny);

        clip_image_u8 canvas;
        canvas.nx = side;
        canvas.ny = side;
        canvas.buf.resize(3 * side * side);
        img_tool::fill(canvas, {0, 0, 0});
        img_tool::composite(canvas, img, 0, 0);

        result = std::move(canvas);
    }

    const int edge_limit = get_image_longest_edge(params);
    const int longest    = std::max(result.nx, result.ny);
    if (longest > edge_limit) {
        const float shrink = static_cast<float>(edge_limit) / longest;

        // Never let a side collapse to zero pixels.
        const clip_image_size target = {
            std::max(1, static_cast<int>(std::floor(result.nx * shrink))),
            std::max(1, static_cast<int>(std::floor(result.ny * shrink))),
        };

        clip_image_u8 downscaled;
        img_tool::resize(result, downscaled, target, RESIZE_ALGO_BILINEAR, false);
        result = std::move(downscaled);
    }

    return result;
}
// Extracts the w x h rectangle whose top-left corner is (x, y) from `image`.
// Parts of the rectangle that fall outside the source stay zero (black).
clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
    clip_image_u8 out;
    out.nx = w;
    out.ny = h;
    out.buf.resize(3 * w * h, 0);

    // Intersection of the requested rectangle with the source image.
    const int left   = std::max(0, x);
    const int top    = std::max(0, y);
    const int right  = std::min(image.nx, x + w);
    const int bottom = std::min(image.ny, y + h);

    const int copy_w = right - left;
    const int copy_h = bottom - top;
    if (copy_w <= 0 || copy_h <= 0) {
        // No overlap at all: the result is entirely black.
        return out;
    }

    // Position of the overlapping region inside the output patch.
    const int out_left = left - x;
    const int out_top  = top - y;

    // Pixels of one row are contiguous in both buffers, so copy row by row.
    for (int row = 0; row < copy_h; ++row) {
        const int src_off = 3 * ((top + row) * image.nx + left);
        const int dst_off = 3 * ((out_top + row) * w + out_left);
        std::copy_n(image.buf.data() + src_off, 3 * copy_w, out.buf.data() + dst_off);
    }

    return out;
}
// Computes the slicing plan for an already prepared image: the overview size,
// the size the image is refined to before cropping, the grid dimensions and
// the coordinates of every square slice (row by row, left to right).
// When no window size applies, refined and grid sizes are zero and no slices are produced.
mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step3vl::build_slice_instructions(
        const clip_hparams & params,
        const clip_image_size & prepared_size) {
    slice_instructions plan;
    plan.overview_size = prepared_size;

    const int long_side  = std::max(prepared_size.width, prepared_size.height);
    const int short_side = std::min(prepared_size.width, prepared_size.height);
    const int window     = determine_window_size(params, long_side, short_side);

    if (window <= 0) {
        plan.refined_size = clip_image_size{0, 0};
        plan.grid_size    = clip_image_size{0, 0};
        return plan;
    }

    // Snap both sides to a whole number of windows.
    const int refined_w = calc_crop_extent(prepared_size.width,  window);
    const int refined_h = calc_crop_extent(prepared_size.height, window);
    plan.refined_size = clip_image_size{refined_w, refined_h};

    const auto col_starts = calc_grid(refined_w, window);
    const auto row_starts = calc_grid(refined_h, window);
    plan.grid_size = clip_image_size{
        static_cast<int>(col_starts.size()),
        static_cast<int>(row_starts.size()),
    };

    for (size_t r = 0; r < row_starts.size(); ++r) {
        for (size_t c = 0; c < col_starts.size(); ++c) {
            plan.slices.push_back(slice_coordinates{
                /* x    */ col_starts[c],
                /* y    */ row_starts[r],
                /* size */ clip_image_size{window, window},
            });
        }
    }

    return plan;
}
bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
clip_image_u8 prepared = prepare_image(img, hparams);
const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
clip_image_f32_ptr overview_f32(clip_image_f32_init());
img_u8_resize_bilinear_to_f32(
prepared,
*overview_f32,
hparams.image_size,
hparams.image_size,
hparams.image_mean,
hparams.image_std);
output.entries.push_back(std::move(overview_f32));
if (instructions.slices.empty()) {
output.grid_x = 0;
output.grid_y = 0;
return true;
}
clip_image_u8 img_for_crop = prepared;
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
clip_image_u8 refined;
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
img_for_crop = std::move(refined);
}
const int crop_size = default_image_crop_size;
for (const auto & slice : instructions.slices) {
// If the requested patch extends past the source image, pad the out-of-bounds area with black.
clip_image_u8 patch = crop_with_black_padding(img_for_crop, slice.x, slice.y, slice.size.width, slice.size.height);
clip_image_f32_ptr patch_f32(clip_image_f32_init());
img_u8_resize_bilinear_to_f32(
patch,
*patch_f32,
crop_size,
crop_size,
hparams.image_mean,
hparams.image_std);
output.entries.push_back(std::move(patch_f32));
}
output.grid_x = instructions.grid_size.width;
output.grid_y = instructions.grid_size.height;
return true;
}
//
// mtmd_image_preprocessor_youtuvl
//

View File

@@ -144,6 +144,35 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
// custom image preprocessing for Step3VL
// Produces one overview image plus an optional grid of square slices.
// ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
    mtmd_image_preprocessor_step3vl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
    // Fills `output` with the overview followed by the slices and sets its grid dimensions.
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
    // Computes the slicing plan (overview size, refined size, grid, slice coordinates)
    // for an image of size `prepared_size`.
    static slice_instructions build_slice_instructions(const clip_hparams & params, const clip_image_size & prepared_size);
private:
    // Longest-edge limit used when the model hyper-parameters do not provide a positive one.
    static constexpr int default_image_longest_edge = 3024;
    // Default slicing window size, and the size every slice is resampled to.
    static constexpr int default_image_crop_size = 504;
    // Images no larger than the overview resolution are sliced only above this aspect ratio.
    static constexpr float small_aspect_ratio_limit = 1.5f;
    // Aspect ratio above which an image is treated as very wide / tall.
    static constexpr float wide_aspect_ratio_limit = 4.0f;
    // Fractional window count above which the crop extent is rounded up.
    static constexpr float crop_rounding_threshold = 0.2f;
    // Bilinear resize of a u8 image to the target size, normalizing with mean / std into dst.
    void img_u8_resize_bilinear_to_f32(
        const clip_image_u8 & src,
        clip_image_f32 & dst,
        int target_width,
        int target_height,
        const float mean[3],
        const float std[3]);
    // Longest-edge limit from `params`, falling back to default_image_longest_edge.
    static int get_image_longest_edge(const clip_hparams & params);
    // Window size for the given longer / shorter side; 0 means "do not slice".
    static int determine_window_size(const clip_hparams & params, int longer, int shorter);
    // Snaps `length` to a whole number of windows (unchanged when shorter than one window).
    static int calc_crop_extent(int length, int window_size);
    // Start offsets of the windows covering `length`.
    static std::vector<int> calc_grid(int length, int window_size);
    // Pads tiny elongated images to a square and caps the longest edge.
    static clip_image_u8 prepare_image(const clip_image_u8 & img, const clip_hparams & params);
    // Crops a w x h rectangle at (x, y), leaving out-of-bounds pixels black.
    static clip_image_u8 crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h);
};
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;

View File

@@ -88,6 +88,7 @@ enum mtmd_slice_tmpl {
MTMD_SLICE_TMPL_LLAMA4,
MTMD_SLICE_TMPL_IDEFICS3,
MTMD_SLICE_TMPL_LFM2,
MTMD_SLICE_TMPL_STEP3VL,
};
const char * mtmd_default_marker() {
@@ -259,7 +260,6 @@ struct mtmd_context {
tok_row_end = {lookup_token("\n")};
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
@@ -331,6 +331,22 @@ struct mtmd_context {
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
} break;
case PROJECTOR_TYPE_STEP3VL:
{
// Step3 format:
// <patch_start> (patch) <patch_end> [<patch_newline>]
// ... (all patch rows)
// <im_start> (overview) <im_end>
slice_tmpl = MTMD_SLICE_TMPL_STEP3VL;
tok_ov_img_start = {lookup_token("<im_start>")};
tok_ov_img_end = {lookup_token("<im_end>")};
tok_sli_img_start = {lookup_token("<patch_start>")};
tok_sli_img_end = {lookup_token("<patch_end>")};
tok_row_end = {lookup_token("<patch_newline>")};
tok_row_end_trail = false;
ov_img_first = false; // patches first, overview last
image_preproc = std::make_unique<mtmd_image_preprocessor_step3vl>(ctx_v);
} break;
case PROJECTOR_TYPE_INTERNVL:
{
// <img> ... (image embeddings) ... </img>
@@ -682,6 +698,7 @@ struct mtmd_tokenizer {
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
) {
const int n_col = batch_f32.grid_x;

File diff suppressed because one or more lines are too long

View File

@@ -18,7 +18,7 @@
<div style="display: contents">
<script>
{
__sveltekit_10avopp = {
__sveltekit_1ppa22i = {
base: new URL('.', location).pathname.slice(0, -1)
};

View File

@@ -4,6 +4,7 @@
import remarkGfm from 'remark-gfm';
import remarkMath from 'remark-math';
import rehypeHighlight from 'rehype-highlight';
import { all as lowlightAll } from 'lowlight';
import remarkRehype from 'remark-rehype';
import rehypeKatex from 'rehype-katex';
import rehypeStringify from 'rehype-stringify';
@@ -96,6 +97,7 @@
return proc
.use(rehypeHighlight, {
languages: lowlightAll,
aliases: { [FileTypeText.XML]: [FileTypeText.SVELTE, FileTypeText.VUE] }
}) // Add syntax highlighting
.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables