mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
2 Commits
0cc4m/vulk
...
b8705
| Author | SHA1 | Date | |
|---|---|---|---|
| | 09343c0198 | | |
| | 97508acb17 | | |
@@ -2219,10 +2219,10 @@ class MmprojModel(ModelBase):
|
||||
self.image_size = self.find_vparam(["image_size"])
|
||||
self.gguf_writer.add_vision_image_size(self.image_size)
|
||||
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "vt_hidden_size"]))
|
||||
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size", "width", "vt_hidden_size"]))
|
||||
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size", "vt_intermediate_size"]))
|
||||
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
|
||||
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "vt_num_attention_heads"]))
|
||||
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads", "heads", "vt_num_attention_heads"]))
|
||||
|
||||
# preprocessor config
|
||||
image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
|
||||
@@ -4949,6 +4949,73 @@ class Glm4VVisionModel(Qwen3VLVisionModel):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLVisionModel(MmprojModel):
    """Mmproj converter for the vision side of ``StepVLForConditionalGeneration``.

    Fills in vision hyper-parameters and preprocessor defaults that the config
    may omit, writes the Step3-VL projector metadata, and renames the vision
    and projector tensors before delegating to ``MmprojModel``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None

        # Derive the feed-forward width from hidden_size * mlp_ratio when the
        # config does not carry an explicit (non-zero) intermediate_size.
        if not self.hparams_vision.get("intermediate_size"):
            hidden_size = self._vision_hidden_size()
            assert hidden_size > 0
            # NOTE(review): the 8960 / 1536 fallback presumably mirrors Step3-VL-10B — confirm.
            mlp_ratio = float(self.hparams_vision.get("mlp_ratio", 8960 / 1536))
            self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))

        # Only used when the preprocessor config does not define its own normalisation.
        self.preprocessor_config.setdefault("image_mean", list(_MISTRAL_COMMON_DATASET_MEAN))
        self.preprocessor_config.setdefault("image_std", list(_MISTRAL_COMMON_DATASET_STD))

    def _vision_hidden_size(self) -> int:
        """Return the vision hidden size from ``hidden_size`` or ``width``, or 0 if neither is set.

        Missing keys, explicit ``None`` and 0 are all treated as "not set", so
        ``__init__`` and ``set_gguf_parameters`` resolve the value identically.
        """
        assert self.hparams_vision is not None
        return int(self.hparams_vision.get("hidden_size") or self.hparams_vision.get("width") or 0)

    def set_gguf_parameters(self):
        """Write the base vision metadata plus the Step3-VL specific projector keys."""
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        projector_stride = int(self.global_config.get("understand_projector_stride", -1))
        hidden_size = self._vision_hidden_size()
        num_layers = int(self.hparams_vision.get("num_hidden_layers") or self.hparams_vision.get("layers") or -1)
        # Refuse to convert any configuration other than the single validated one.
        assert (projector_stride, int(self.hparams_vision.get("image_size", -1)), hidden_size, num_layers) == (2, 728, 1536, 47), (
            "current Step3-VL conversion path is only validated for Step3-VL-10B"
        )

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.STEP3VL)
        self.gguf_writer.add_vision_attention_layernorm_eps(float(self.hparams_vision.get("layer_norm_eps", 1e-5)))
        self.gguf_writer.add_vision_projector_scale_factor(projector_stride ** 2)
        # 3024 max resize comes from step3-vl-10b processing_step3.py.
        self.gguf_writer.add_vision_preproc_image_size(3024)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Keep position embeddings in F32; defer every other tensor to the base class."""
        if ".position_embd." in new_name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one checkpoint tensor to its mmproj name(s).

        Tensors under ``model.`` / ``lm_head.`` are dropped. Raises
        ``ValueError`` for a ``vit_downsampler`` tensor whose name does not
        have the exact ``vit_downsampler<N>.(weight|bias)`` form.
        """
        # These prefixes are not part of the mmproj file.
        if name.startswith(("model.", "lm_head.")):
            return

        if name.startswith("vision_model.vit_downsampler"):
            # fullmatch: a trailing suffix after weight/bias must not be accepted silently.
            match = re.fullmatch(r"vision_model\.vit_downsampler(\d+)\.(weight|bias)", name)
            if match is None:
                raise ValueError(f"Unexpected Step3-VL projector tensor {name!r}")

            # Checkpoint numbering starts at 1, GGUF projector ids at 0.
            proj_id = int(match.group(1)) - 1
            suffix = f".{match.group(2)}"
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, proj_id, suffix=suffix), data_torch)
            return

        if name == "vit_large_projector.weight":
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ_FC), data_torch)
            return

        if name.startswith("vision_model."):
            # Give parameter-style names a ".weight"/".bias" leaf before the generic mapping runs.
            if name == "vision_model.positional_embedding":
                name += ".weight"
            elif name.endswith(".gamma") and ".ls_" in name:
                name = name.removesuffix(".gamma") + ".weight"

            name = name.replace("attn.in_proj_weight", "attn.in_proj.weight")
            name = name.replace("attn.in_proj_bias", "attn.in_proj.bias")

        yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3VLForConditionalGeneration")
|
||||
class Qwen3VLTextModel(Qwen3Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3VL
|
||||
@@ -4969,6 +5036,16 @@ class Qwen3VLTextModel(Qwen3Model):
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("StepVLForConditionalGeneration")
class Step3VLTextModel(Qwen3Model):
    """Text-only converter for ``StepVLForConditionalGeneration``, written as a QWEN3 architecture."""

    model_arch = gguf.MODEL_ARCH.QWEN3

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Forward text tensors to ``Qwen3Model`` and drop vision/projector tensors."""
        non_text_prefixes = ("vision_model.", "model.vision_model.", "vit_large_projector.")
        if not name.startswith(non_text_prefixes):
            yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
|
||||
class Qwen3VLMoeTextModel(Qwen3MoeModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
|
||||
@@ -12994,6 +13071,12 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
|
||||
# For non-hf Mamba and Mamba2 models
|
||||
arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
|
||||
|
||||
# Step3-VL keeps text config under text_config but uses a custom top-level architecture.
|
||||
# For text conversion we route to a dedicated text-only class.
|
||||
# TODO: refactor this later to avoid adding exception here
|
||||
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
|
||||
return arch
|
||||
|
||||
# if "architectures" is found in the sub-config, use that instead
|
||||
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
|
||||
arch = text_config["architectures"][0]
|
||||
|
||||
@@ -223,7 +223,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
|
||||
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
|
||||
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||
option(GGML_VULKAN_COPY_TESTS "ggml: run Vulkan cross-device copy benchmarks" OFF)
|
||||
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
|
||||
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
||||
option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
|
||||
|
||||
@@ -120,10 +120,6 @@ if (Vulkan_FOUND)
|
||||
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
|
||||
endif()
|
||||
|
||||
if (GGML_VULKAN_COPY_TESTS)
|
||||
add_compile_definitions(GGML_VULKAN_COPY_TESTS)
|
||||
endif()
|
||||
|
||||
# Set up toolchain for host compilation whether cross-compiling or not
|
||||
if (CMAKE_CROSSCOMPILING)
|
||||
if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
#include "ggml-vulkan.h"
|
||||
#include <vulkan/vulkan_core.h>
|
||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_COPY_TESTS)
|
||||
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
|
||||
#include <chrono>
|
||||
#include "ggml-cpu.h"
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_COPY_TESTS) && !defined(_WIN32)
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
|
||||
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
|
||||
@@ -594,7 +591,6 @@ struct vk_device_struct {
|
||||
uint64_t suballocation_block_size;
|
||||
uint64_t min_imported_host_pointer_alignment;
|
||||
bool external_memory_host {};
|
||||
bool external_semaphore_fd {};
|
||||
bool fp16;
|
||||
bool bf16;
|
||||
bool pipeline_robustness;
|
||||
@@ -1663,7 +1659,6 @@ struct ggml_vk_garbage_collector {
|
||||
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx);
|
||||
static void ggml_vk_load_shaders(vk_device& device);
|
||||
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx);
|
||||
static vk_buffer ggml_vk_buffer_from_host_ptr(vk_device & device, void * ptr, size_t size);
|
||||
|
||||
static bool vk_memory_logger_enabled = false;
|
||||
|
||||
@@ -4887,8 +4882,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device->memory_priority = true;
|
||||
} else if (strcmp("VK_EXT_external_memory_host", properties.extensionName) == 0) {
|
||||
device->external_memory_host = true;
|
||||
} else if (strcmp("VK_KHR_external_semaphore_fd", properties.extensionName) == 0) {
|
||||
device->external_semaphore_fd = true;
|
||||
#if defined(VK_EXT_shader_64bit_indexing)
|
||||
} else if (strcmp("VK_EXT_shader_64bit_indexing", properties.extensionName) == 0) {
|
||||
device->shader_64b_indexing = true;
|
||||
@@ -5188,10 +5181,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
device_extensions.push_back("VK_EXT_external_memory_host");
|
||||
}
|
||||
|
||||
if (device->external_semaphore_fd) {
|
||||
device_extensions.push_back("VK_KHR_external_semaphore_fd");
|
||||
}
|
||||
|
||||
#if defined(VK_EXT_shader_64bit_indexing)
|
||||
VkPhysicalDeviceShader64BitIndexingFeaturesEXT shader_64bit_indexing_features {};
|
||||
shader_64bit_indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_64_BIT_INDEXING_FEATURES_EXT;
|
||||
@@ -12641,654 +12630,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_VULKAN_COPY_TESTS
|
||||
// Cross-device copy benchmark
|
||||
// Tests different approaches to copying data between two Vulkan devices.
|
||||
// Build with -DGGML_VULKAN_COPY_TESTS and run any llama.cpp command with >= 2 Vulkan devices.
|
||||
|
||||
// Helper: allocate shared staging buffer importable by both devices
|
||||
struct vk_shared_staging {
|
||||
void * host_ptr = nullptr;
|
||||
vk_buffer buf_dev0;
|
||||
vk_buffer buf_dev1;
|
||||
size_t size = 0;
|
||||
|
||||
bool alloc(vk_device & dev0, vk_device & dev1, size_t sz) {
|
||||
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
|
||||
dev1->min_imported_host_pointer_alignment);
|
||||
size = (sz + align - 1) & ~(align - 1);
|
||||
#ifdef _WIN32
|
||||
host_ptr = _aligned_malloc(size, align);
|
||||
#else
|
||||
if (posix_memalign(&host_ptr, align, size) != 0) { host_ptr = nullptr; }
|
||||
#endif
|
||||
if (!host_ptr) return false;
|
||||
buf_dev0 = ggml_vk_buffer_from_host_ptr(dev0, host_ptr, size);
|
||||
buf_dev1 = ggml_vk_buffer_from_host_ptr(dev1, host_ptr, size);
|
||||
return buf_dev0 && buf_dev1;
|
||||
}
|
||||
|
||||
void free_resources() {
|
||||
ggml_vk_destroy_buffer(buf_dev0);
|
||||
ggml_vk_destroy_buffer(buf_dev1);
|
||||
#ifdef _WIN32
|
||||
_aligned_free(host_ptr);
|
||||
#else
|
||||
free(host_ptr);
|
||||
#endif
|
||||
host_ptr = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper: run a benchmark and print results
|
||||
// Print one benchmark line to stderr: the median of `times` (milliseconds) and the
// bandwidth that median implies for a transfer of `size` bytes.
//
// `times` is sorted in place. With an even number of samples the upper of the two
// middle values is reported. An empty `times` prints a "no samples" line instead of
// indexing into the empty vector.
static void vk_bench_print(const char * name, std::vector<double> & times, size_t size) {
    if (times.empty()) {
        std::cerr << " " << std::left << std::setw(22) << name << " : "
                  << "no samples" << std::endl;
        return;
    }

    std::sort(times.begin(), times.end());
    const double median = times[times.size() / 2];
    // bytes -> GiB, milliseconds -> seconds
    const double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0);
    std::cerr << " " << std::left << std::setw(22) << name << " : "
              << std::fixed << std::setprecision(3) << median << " ms "
              << std::setprecision(2) << bw << " GB/s" << std::endl;
}
|
||||
|
||||
// Results stored per (method, size) for table output
|
||||
// One entry of the cross-device copy benchmark table: the outcome of a single
// copy method at a single transfer size.
struct vk_copy_result {
    // Name of the copy method this entry belongs to.
    std::string method;
    // Median duration in milliseconds; a negative value marks a skipped size.
    double ms;
    // Bandwidth derived from `ms` (GiB per second); negative for a skipped size.
    double gbps;
};
|
||||
|
||||
static void ggml_vk_bench_pair(
|
||||
vk_device & dev0, vk_device & dev1,
|
||||
const std::vector<size_t> & test_sizes,
|
||||
std::map<std::string, std::vector<vk_copy_result>> & results) {
|
||||
|
||||
const size_t num_it = 20;
|
||||
const size_t warmup = 3;
|
||||
const size_t max_size = test_sizes.back();
|
||||
|
||||
// Allocate buffers
|
||||
vk_buffer buf_src = ggml_vk_create_buffer_check(dev0, max_size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
vk_buffer buf_dst = ggml_vk_create_buffer_check(dev1, max_size, {vk::MemoryPropertyFlagBits::eDeviceLocal});
|
||||
vk_buffer staging_src = ggml_vk_create_buffer_check(dev0, max_size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
vk_buffer staging_dst = ggml_vk_create_buffer_check(dev1, max_size,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
|
||||
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
|
||||
// Fill source
|
||||
{
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
subctx->s->buffer->buf.fillBuffer(buf_src->buffer, 0, max_size, 0xDEADBEEF);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev0->fence);
|
||||
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "fill");
|
||||
dev0->device.resetFences({ dev0->fence });
|
||||
}
|
||||
|
||||
bool has_shared_staging = dev0->external_memory_host && dev1->external_memory_host;
|
||||
bool has_syncfd = false;
|
||||
#ifndef _WIN32
|
||||
if (dev0->external_semaphore_fd && dev1->external_semaphore_fd) {
|
||||
vk::PhysicalDeviceExternalSemaphoreInfo query{};
|
||||
query.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
auto p0 = dev0->physical_device.getExternalSemaphoreProperties(query);
|
||||
auto p1 = dev1->physical_device.getExternalSemaphoreProperties(query);
|
||||
has_syncfd =
|
||||
(p0.externalSemaphoreFeatures & vk::ExternalSemaphoreFeatureFlagBits::eExportable) &&
|
||||
(p0.compatibleHandleTypes & vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd) &&
|
||||
(p1.externalSemaphoreFeatures & vk::ExternalSemaphoreFeatureFlagBits::eImportable) &&
|
||||
(p1.compatibleHandleTypes & vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Helper to record a result
|
||||
auto record = [&](const std::string & method, size_t size, std::vector<double> & times) {
|
||||
std::sort(times.begin(), times.end());
|
||||
double median = times[times.size() / 2];
|
||||
double bw = (size / (1024.0 * 1024.0 * 1024.0)) / (median / 1000.0);
|
||||
results[method].push_back({ method, median, bw });
|
||||
};
|
||||
|
||||
// Helper to record a skipped size (sentinel: negative ms)
|
||||
auto skip = [&](const std::string & method) {
|
||||
results[method].push_back({ method, -1.0, -1.0 });
|
||||
};
|
||||
|
||||
for (size_t size : test_sizes) {
|
||||
|
||||
// =================================================================
|
||||
// 1. Baseline: current sync double-hop (separate staging buffers + memcpy)
|
||||
// =================================================================
|
||||
{
|
||||
std::vector<double> times;
|
||||
for (size_t i = 0; i < num_it + warmup; i++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev0->fence);
|
||||
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "baseline hop1");
|
||||
dev0->device.resetFences({ dev0->fence });
|
||||
}
|
||||
memcpy(staging_dst->ptr, staging_src->ptr, size);
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev1, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev1->fence);
|
||||
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "baseline hop2");
|
||||
dev1->device.resetFences({ dev1->fence });
|
||||
}
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
record("baseline", size, times);
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// 2. Diagnostics: individual hop timings
|
||||
// =================================================================
|
||||
{
|
||||
std::vector<double> times;
|
||||
for (size_t i = 0; i < num_it + warmup; i++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, staging_src, 0, buf_src, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev0->fence);
|
||||
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "diag hop1");
|
||||
dev0->device.resetFences({ dev0->fence });
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
record("hop1_only", size, times);
|
||||
}
|
||||
{
|
||||
std::vector<double> times;
|
||||
for (size_t i = 0; i < num_it + warmup; i++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev1, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, staging_dst, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev1->fence);
|
||||
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "diag hop2");
|
||||
dev1->device.resetFences({ dev1->fence });
|
||||
}
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
record("hop2_only", size, times);
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// 3. Shared staging: single host buffer imported into both devices
|
||||
// =================================================================
|
||||
if (has_shared_staging) {
|
||||
vk_shared_staging stg;
|
||||
if (stg.alloc(dev0, dev1, size)) {
|
||||
std::vector<double> times;
|
||||
for (size_t i = 0; i < num_it + warmup; i++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(dev0->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, 0, buf_src, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev0->fence);
|
||||
VK_CHECK(dev0->device.waitForFences({ dev0->fence }, true, UINT64_MAX), "shared hop1");
|
||||
dev0->device.resetFences({ dev0->fence });
|
||||
}
|
||||
{
|
||||
std::lock_guard<std::recursive_mutex> guard(dev1->mutex);
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev1, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, stg.buf_dev1, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev1->fence);
|
||||
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "shared hop2");
|
||||
dev1->device.resetFences({ dev1->fence });
|
||||
}
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
record("shared_staging", size, times);
|
||||
} else {
|
||||
std::cerr << " shared_staging : SKIPPED (import failed)" << std::endl;
|
||||
}
|
||||
stg.free_resources();
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// 4. Chunked pipeline: split into N chunks, overlap hop1/hop2
|
||||
// via full-duplex PCIe. Vary chunk count to find optimum.
|
||||
// =================================================================
|
||||
if (has_shared_staging) {
|
||||
for (size_t n_chunks : { 2, 4, 8 }) {
|
||||
char cname[32];
|
||||
snprintf(cname, sizeof(cname), "chunked_%zu", n_chunks);
|
||||
if (size < n_chunks * 4096) { skip(cname); continue; }
|
||||
|
||||
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
|
||||
dev1->min_imported_host_pointer_alignment);
|
||||
size_t chunk_data = size / n_chunks;
|
||||
size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1);
|
||||
|
||||
vk_shared_staging stg;
|
||||
if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) {
|
||||
std::cerr << " chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl;
|
||||
stg.free_resources();
|
||||
continue;
|
||||
}
|
||||
|
||||
// Per-chunk timeline semaphores
|
||||
std::vector<vk::Semaphore> chunk_sems(n_chunks);
|
||||
std::vector<uint64_t> sem_vals(n_chunks, 0);
|
||||
for (size_t c = 0; c < n_chunks; c++) {
|
||||
vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
|
||||
vk::SemaphoreCreateInfo sci{};
|
||||
sci.setPNext(&tci);
|
||||
chunk_sems[c] = dev0->device.createSemaphore(sci);
|
||||
}
|
||||
|
||||
std::vector<double> times;
|
||||
for (size_t iter = 0; iter < num_it + warmup; iter++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// Submit all hop1s upfront
|
||||
for (size_t c = 0; c < n_chunks; c++) {
|
||||
size_t off_src = c * chunk_data;
|
||||
size_t off_stg = c * chunk_aligned;
|
||||
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, off_stg, buf_src, off_src, csz);
|
||||
sem_vals[c]++;
|
||||
subctx->s->signal_semaphores.push_back({ chunk_sems[c], sem_vals[c] });
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, {});
|
||||
}
|
||||
|
||||
// Per-chunk: CPU wait hop1, submit hop2
|
||||
for (size_t c = 0; c < n_chunks; c++) {
|
||||
size_t off_dst = c * chunk_data;
|
||||
size_t off_stg = c * chunk_aligned;
|
||||
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
|
||||
|
||||
vk::SemaphoreWaitInfo swi{vk::SemaphoreWaitFlags{}, chunk_sems[c], sem_vals[c]};
|
||||
VK_CHECK(dev0->device.waitSemaphores(swi, UINT64_MAX), "chunked sem wait");
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev1, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, buf_dst, off_dst, stg.buf_dev1, off_stg, csz);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, (c == n_chunks - 1) ? dev1->fence : vk::Fence{});
|
||||
}
|
||||
|
||||
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "chunked final");
|
||||
dev1->device.resetFences({ dev1->fence });
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (iter >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
|
||||
char name[32];
|
||||
snprintf(name, sizeof(name), "chunked_%zu", n_chunks);
|
||||
record(name, size, times);
|
||||
|
||||
for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(chunk_sems[c]);
|
||||
stg.free_resources();
|
||||
}
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// 5. sync_fd async: fully GPU-synchronised via Linux sync_file
|
||||
// =================================================================
|
||||
#ifndef _WIN32
|
||||
if (has_shared_staging && has_syncfd) {
|
||||
vk_shared_staging stg;
|
||||
if (stg.alloc(dev0, dev1, size)) {
|
||||
std::vector<double> times;
|
||||
bool run_ok = true;
|
||||
|
||||
for (size_t i = 0; i < num_it + warmup && run_ok; i++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
vk::ExportSemaphoreCreateInfo esci{};
|
||||
esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
vk::SemaphoreCreateInfo sci{};
|
||||
sci.setPNext(&esci);
|
||||
vk::Semaphore sem_dev0 = dev0->device.createSemaphore(sci);
|
||||
|
||||
// Hop 1 + signal
|
||||
{
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, 0, buf_src, 0, size);
|
||||
subctx->s->signal_semaphores.push_back({ sem_dev0, 0 });
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, {});
|
||||
}
|
||||
|
||||
// Export + import sync_fd
|
||||
int sync_fd = -1;
|
||||
try {
|
||||
vk::SemaphoreGetFdInfoKHR gi{};
|
||||
gi.semaphore = sem_dev0;
|
||||
gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
sync_fd = dev0->device.getSemaphoreFdKHR(gi);
|
||||
} catch (vk::SystemError& e) {
|
||||
std::cerr << " syncfd_async : SKIPPED (export: " << e.what() << ")" << std::endl;
|
||||
dev0->device.destroySemaphore(sem_dev0);
|
||||
run_ok = false; break;
|
||||
}
|
||||
|
||||
vk::Semaphore sem_dev1 = dev1->device.createSemaphore({});
|
||||
try {
|
||||
vk::ImportSemaphoreFdInfoKHR ii{};
|
||||
ii.semaphore = sem_dev1;
|
||||
ii.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
ii.flags = vk::SemaphoreImportFlagBits::eTemporary;
|
||||
ii.fd = sync_fd;
|
||||
dev1->device.importSemaphoreFdKHR(ii);
|
||||
} catch (vk::SystemError& e) {
|
||||
std::cerr << " syncfd_async : SKIPPED (import: " << e.what() << ")" << std::endl;
|
||||
dev0->device.destroySemaphore(sem_dev0);
|
||||
dev1->device.destroySemaphore(sem_dev1);
|
||||
close(sync_fd);
|
||||
run_ok = false; break;
|
||||
}
|
||||
|
||||
// Hop 2 with GPU-side wait
|
||||
{
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev1, subctx);
|
||||
subctx->s->wait_semaphores.push_back({ sem_dev1, 0 });
|
||||
ggml_vk_buffer_copy_async(subctx, buf_dst, 0, stg.buf_dev1, 0, size);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, dev1->fence);
|
||||
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "syncfd final");
|
||||
dev1->device.resetFences({ dev1->fence });
|
||||
}
|
||||
|
||||
dev0->device.destroySemaphore(sem_dev0);
|
||||
dev1->device.destroySemaphore(sem_dev1);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (i >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
if (run_ok) record("syncfd_async", size, times);
|
||||
} else {
|
||||
std::cerr << " syncfd_async : SKIPPED (import failed)" << std::endl;
|
||||
}
|
||||
stg.free_resources();
|
||||
}
|
||||
|
||||
// =================================================================
|
||||
// 6. sync_fd chunked: chunked pipeline with GPU-side sync_fd
|
||||
// between hops (no CPU waits between chunks)
|
||||
// =================================================================
|
||||
if (has_shared_staging && has_syncfd) {
|
||||
for (size_t n_chunks : { 2, 4, 8 }) {
|
||||
char scname[48];
|
||||
snprintf(scname, sizeof(scname), "syncfd_chunked_%zu", n_chunks);
|
||||
if (size < n_chunks * 4096) { skip(scname); continue; }
|
||||
|
||||
size_t align = std::max(dev0->min_imported_host_pointer_alignment,
|
||||
dev1->min_imported_host_pointer_alignment);
|
||||
size_t chunk_data = size / n_chunks;
|
||||
size_t chunk_aligned = (chunk_data + align - 1) & ~(align - 1);
|
||||
|
||||
vk_shared_staging stg;
|
||||
if (!stg.alloc(dev0, dev1, chunk_aligned * n_chunks)) {
|
||||
std::cerr << " syncfd_chunked_" << n_chunks << " : SKIPPED (import failed)" << std::endl;
|
||||
stg.free_resources();
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<double> times;
|
||||
bool run_ok = true;
|
||||
|
||||
for (size_t iter = 0; iter < num_it + warmup && run_ok; iter++) {
|
||||
auto begin = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// Create per-chunk exportable semaphores
|
||||
std::vector<vk::Semaphore> sems_dev0(n_chunks);
|
||||
for (size_t c = 0; c < n_chunks; c++) {
|
||||
vk::ExportSemaphoreCreateInfo esci{};
|
||||
esci.handleTypes = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
vk::SemaphoreCreateInfo sci{};
|
||||
sci.setPNext(&esci);
|
||||
sems_dev0[c] = dev0->device.createSemaphore(sci);
|
||||
}
|
||||
|
||||
// Submit all hop1s with per-chunk signal
|
||||
for (size_t c = 0; c < n_chunks; c++) {
|
||||
size_t off_src = c * chunk_data;
|
||||
size_t off_stg = c * chunk_aligned;
|
||||
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev0->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev0, subctx);
|
||||
ggml_vk_buffer_copy_async(subctx, stg.buf_dev0, off_stg, buf_src, off_src, csz);
|
||||
subctx->s->signal_semaphores.push_back({ sems_dev0[c], 0 });
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, {});
|
||||
}
|
||||
|
||||
// Export all sync_fds and import on dev1, submit hop2s
|
||||
for (size_t c = 0; c < n_chunks && run_ok; c++) {
|
||||
size_t off_dst = c * chunk_data;
|
||||
size_t off_stg = c * chunk_aligned;
|
||||
size_t csz = (c == n_chunks - 1) ? (size - c * chunk_data) : chunk_data;
|
||||
|
||||
int sync_fd = -1;
|
||||
try {
|
||||
vk::SemaphoreGetFdInfoKHR gi{};
|
||||
gi.semaphore = sems_dev0[c];
|
||||
gi.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
sync_fd = dev0->device.getSemaphoreFdKHR(gi);
|
||||
} catch (vk::SystemError& e) {
|
||||
char nm[48]; snprintf(nm, sizeof(nm), "syncfd_chunked_%zu", n_chunks);
|
||||
std::cerr << " " << nm << " : SKIPPED (export: " << e.what() << ")" << std::endl;
|
||||
run_ok = false; break;
|
||||
}
|
||||
|
||||
vk::Semaphore sem_dev1 = dev1->device.createSemaphore({});
|
||||
try {
|
||||
vk::ImportSemaphoreFdInfoKHR ii{};
|
||||
ii.semaphore = sem_dev1;
|
||||
ii.handleType = vk::ExternalSemaphoreHandleTypeFlagBits::eSyncFd;
|
||||
ii.flags = vk::SemaphoreImportFlagBits::eTemporary;
|
||||
ii.fd = sync_fd;
|
||||
dev1->device.importSemaphoreFdKHR(ii);
|
||||
} catch (vk::SystemError& e) {
|
||||
char nm[48]; snprintf(nm, sizeof(nm), "syncfd_chunked_%zu", n_chunks);
|
||||
std::cerr << " " << nm << " : SKIPPED (import: " << e.what() << ")" << std::endl;
|
||||
dev1->device.destroySemaphore(sem_dev1);
|
||||
close(sync_fd);
|
||||
run_ok = false; break;
|
||||
}
|
||||
|
||||
vk_context subctx = ggml_vk_create_temporary_context(dev1->transfer_queue.cmd_pool);
|
||||
ggml_vk_ctx_begin(dev1, subctx);
|
||||
subctx->s->wait_semaphores.push_back({ sem_dev1, 0 });
|
||||
ggml_vk_buffer_copy_async(subctx, buf_dst, off_dst, stg.buf_dev1, off_stg, csz);
|
||||
ggml_vk_ctx_end(subctx);
|
||||
ggml_vk_submit(subctx, (c == n_chunks - 1) ? dev1->fence : vk::Fence{});
|
||||
|
||||
dev1->device.destroySemaphore(sem_dev1);
|
||||
}
|
||||
|
||||
if (run_ok) {
|
||||
VK_CHECK(dev1->device.waitForFences({ dev1->fence }, true, UINT64_MAX), "syncfd_chunked final");
|
||||
dev1->device.resetFences({ dev1->fence });
|
||||
}
|
||||
|
||||
for (size_t c = 0; c < n_chunks; c++) dev0->device.destroySemaphore(sems_dev0[c]);
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
if (run_ok && iter >= warmup) times.push_back(std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() / 1000.0);
|
||||
}
|
||||
|
||||
if (run_ok) {
|
||||
char name[48];
|
||||
snprintf(name, sizeof(name), "syncfd_chunked_%zu", n_chunks);
|
||||
record(name, size, times);
|
||||
}
|
||||
stg.free_resources();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ggml_vk_destroy_buffer(buf_src);
|
||||
ggml_vk_destroy_buffer(buf_dst);
|
||||
ggml_vk_destroy_buffer(staging_src);
|
||||
ggml_vk_destroy_buffer(staging_dst);
|
||||
}
|
||||
|
||||
// Debug benchmark driver: runs ggml_vk_bench_pair() for every ordered pair of
// distinct Vulkan devices and prints the collected timings to stderr as one
// markdown table per copy method (rows = direction, columns = transfer size).
// Returns early when fewer than two devices are available; otherwise it ends
// the process through GGML_ABORT once the tables have been printed.
static void ggml_vk_test_cross_device_copy(ggml_backend_vk_context * ctx) {
    GGML_UNUSED(ctx);

    ggml_vk_instance_init();

    const size_t device_count = vk_instance.device_indices.size();
    if (device_count < 2) {
        std::cerr << "COPY TEST: Need at least 2 Vulkan devices, found " << device_count << std::endl;
        return;
    }

    // Enumerate the devices and print them with their index.
    std::cerr << "\n=== Vulkan Devices ===" << std::endl;
    std::vector<vk_device> devs;
    devs.reserve(device_count);
    for (size_t idx = 0; idx < device_count; idx++) {
        devs.push_back(ggml_vk_get_device(idx));
        std::cerr << " [" << idx << "] " << devs.back()->name << std::endl;
    }

    const std::vector<size_t> test_sizes = {
        4096,               // 4 KB
        256 * 1024,         // 256 KB
        1 * 1024 * 1024,    // 1 MB
        16 * 1024 * 1024,   // 16 MB
        64 * 1024 * 1024,   // 64 MB
        256 * 1024 * 1024,  // 256 MB
    };

    // Results of one direction: methods[method_name] holds one vk_copy_result per test size.
    struct pair_results {
        std::string label;
        std::map<std::string, std::vector<vk_copy_result>> methods;
    };
    std::vector<pair_results> all_results;

    // Benchmark every ordered (source, destination) pair.
    for (size_t src = 0; src < device_count; src++) {
        for (size_t dst = 0; dst < device_count; dst++) {
            if (src == dst) {
                continue;
            }

            pair_results entry;
            entry.label = devs[src]->name + " -> " + devs[dst]->name;
            std::cerr << "\n\n=== " << entry.label << " ===" << std::endl;

            ggml_vk_bench_pair(devs[src], devs[dst], test_sizes, entry.methods);
            all_results.push_back(std::move(entry));
        }
    }

    // Table order: the first pair's methods come first (in map order), followed by
    // any method that only shows up for a later pair.
    std::vector<std::string> method_order;
    for (const auto & pr : all_results) {
        for (const auto & kv : pr.methods) {
            if (std::find(method_order.begin(), method_order.end(), kv.first) == method_order.end()) {
                method_order.push_back(kv.first);
            }
        }
    }

    // Prints one header cell, in KB below 1 MiB and in MB otherwise.
    auto print_size_cell = [](size_t bytes) {
        const size_t mib = 1024 * 1024;
        if (bytes < mib) {
            std::cerr << " " << bytes / 1024 << " KB |";
        } else {
            std::cerr << " " << bytes / mib << " MB |";
        }
    };

    std::cerr << "\n\n# Cross-Device Copy Benchmark Results\n" << std::endl;

    for (const auto & method : method_order) {
        std::cerr << "## " << method << "\n" << std::endl;

        // Header row: | Direction | 4 KB | 256 KB | ... |
        std::cerr << "| Direction |";
        for (size_t bytes : test_sizes) {
            print_size_cell(bytes);
        }
        std::cerr << std::endl;

        // Separator row
        std::cerr << "|---|";
        for (size_t col = 0; col < test_sizes.size(); col++) {
            std::cerr << "---|";
        }
        std::cerr << std::endl;

        // One data row per direction; a dash marks a missing or failed measurement.
        for (const auto & pr : all_results) {
            std::cerr << "| " << pr.label << " |";

            const auto found = pr.methods.find(method);
            const bool complete = found != pr.methods.end() && found->second.size() == test_sizes.size();

            for (size_t col = 0; col < test_sizes.size(); col++) {
                if (!complete || found->second[col].ms < 0) {
                    std::cerr << " - |";
                    continue;
                }
                const auto & r = found->second[col];
                std::cerr << " " << std::fixed << std::setprecision(1) << r.ms << " ms (" << std::setprecision(1) << r.gbps << " GB/s) |";
            }
            std::cerr << std::endl;
        }
        std::cerr << std::endl;
    }

    GGML_ABORT("GGML_VULKAN_COPY_TESTS completed");
}
|
||||
#endif
|
||||
|
||||
static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_context subctx) {
|
||||
#if defined(GGML_VULKAN_COPY_TESTS)
|
||||
ggml_vk_test_cross_device_copy(ctx);
|
||||
#endif
|
||||
#if defined(GGML_VULKAN_RUN_TESTS)
|
||||
const std::vector<size_t> vals {
|
||||
512, 512, 128,
|
||||
|
||||
@@ -506,6 +506,7 @@ class VISION_PROJECTOR_TYPE(IntEnum):
|
||||
GEMMA3N = auto()
|
||||
GEMMA3 = auto()
|
||||
QWEN3VL = auto()
|
||||
STEP3VL = auto()
|
||||
COGVLM = auto()
|
||||
|
||||
|
||||
@@ -987,6 +988,8 @@ VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
||||
VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
|
||||
VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
|
||||
VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
|
||||
VISION_PROJECTOR_TYPE.QWEN3VL: "qwen3vl_merger",
|
||||
VISION_PROJECTOR_TYPE.STEP3VL: "step3vl",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -4105,6 +4108,7 @@ class VisionProjectorType:
|
||||
QWEN2VL = "qwen2vl_merger"
|
||||
QWEN25VL = "qwen2.5vl_merger"
|
||||
QWEN3VL = "qwen3vl_merger"
|
||||
STEP3VL = "step3vl"
|
||||
ULTRAVOX = "ultravox"
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
|
||||
@@ -1406,6 +1406,7 @@ class TensorNameMap:
|
||||
"siglip2.vision_model.embeddings.patch_embedding",
|
||||
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
|
||||
"model.vision_tower.patch_embedder.input_proj", # gemma4
|
||||
"vision_model.conv1", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_NORM: (
|
||||
@@ -1425,6 +1426,7 @@ class TensorNameMap:
|
||||
"visual.embeddings.position_embedding", # glm4v
|
||||
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
|
||||
"model.vision_tower.patch_embedder.position_embedding_table", # gemma4
|
||||
"vision_model.positional_embedding", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
|
||||
@@ -1443,6 +1445,7 @@ class TensorNameMap:
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
|
||||
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
|
||||
"vision_model.transformer.resblocks.{bid}.attn.in_proj", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q: (
|
||||
@@ -1523,6 +1526,7 @@ class TensorNameMap:
|
||||
"model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
|
||||
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_O: (
|
||||
@@ -1543,6 +1547,7 @@ class TensorNameMap:
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
|
||||
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
|
||||
@@ -1562,6 +1567,7 @@ class TensorNameMap:
|
||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
|
||||
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
@@ -1582,6 +1588,7 @@ class TensorNameMap:
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
|
||||
"vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE: (
|
||||
@@ -1609,6 +1616,7 @@ class TensorNameMap:
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
|
||||
"vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_POST_NORM: (
|
||||
@@ -1622,11 +1630,13 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_LAYER_SCALE_1: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
|
||||
"model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
|
||||
"vision_model.transformer.resblocks.{bid}.ls_1", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_LAYER_SCALE_2: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
|
||||
"model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
|
||||
"vision_model.transformer.resblocks.{bid}.ls_2", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_LAYER_OUT_SCALE: (
|
||||
@@ -1639,6 +1649,7 @@ class TensorNameMap:
|
||||
"vision_encoder.ln_pre", # pixtral
|
||||
"vision_model.layernorm_pre", # llama4
|
||||
"model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
|
||||
"vision_model.ln_pre", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_POST_NORM: (
|
||||
|
||||
@@ -31,6 +31,7 @@ add_library(mtmd
|
||||
models/pixtral.cpp
|
||||
models/qwen2vl.cpp
|
||||
models/qwen3vl.cpp
|
||||
models/step3vl.cpp
|
||||
models/siglip.cpp
|
||||
models/whisper-enc.cpp
|
||||
models/deepseekocr.cpp
|
||||
|
||||
@@ -242,6 +242,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_QWEN2VL,
|
||||
PROJECTOR_TYPE_QWEN3VL,
|
||||
PROJECTOR_TYPE_STEP3VL,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_GEMMA3NV,
|
||||
PROJECTOR_TYPE_GEMMA3NA,
|
||||
@@ -284,6 +285,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
|
||||
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
|
||||
{ PROJECTOR_TYPE_STEP3VL, "step3vl"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
|
||||
{ PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
|
||||
|
||||
@@ -79,7 +79,6 @@ struct clip_hparams {
|
||||
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
|
||||
@@ -862,6 +862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_step3vl>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
|
||||
@@ -1337,6 +1341,17 @@ struct clip_model_loader {
|
||||
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
hparams.n_merge = 4; // two stride-2 downsamplers after patching
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
|
||||
hparams.rope_theta = 10000.0f;
|
||||
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
|
||||
if (hparams.image_longest_edge == 0) {
|
||||
hparams.image_longest_edge = 3024;
|
||||
}
|
||||
hparams.warmup_image_size = hparams.image_size;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
hparams.n_merge = 2;
|
||||
@@ -1769,6 +1784,14 @@ struct clip_model_loader {
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
|
||||
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
|
||||
@@ -2615,6 +2638,8 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_HUNYUANOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->nx / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
return img->nx / (params.patch_size * params.n_merge);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -2632,6 +2657,8 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
return (img->ny / params.patch_size) / 2;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
return img->ny / (params.patch_size * params.n_merge);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -2702,6 +2729,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
int y_patch = img->ny / (params.patch_size * 2);
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
int x_patch = img->nx / (params.patch_size * params.n_merge);
|
||||
int y_patch = img->ny / (params.patch_size * params.n_merge);
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA4V:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
@@ -3004,6 +3037,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
std::vector<int32_t> pos_data(n_pos);
|
||||
for (int i = 0; i < n_pos; i++) {
|
||||
pos_data[i] = i / pos_w;
|
||||
}
|
||||
set_input_i32("pos_h", pos_data);
|
||||
for (int i = 0; i < n_pos; i++) {
|
||||
pos_data[i] = i % pos_w;
|
||||
}
|
||||
set_input_i32("pos_w", pos_data);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
const int merge_ratio = hparams.n_merge;
|
||||
@@ -3358,6 +3403,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
// main path + deepstack paths
|
||||
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
return ctx->model.mm_input_proj_w->ne[0];
|
||||
|
||||
@@ -33,6 +33,11 @@ struct clip_graph_qwen3vl : clip_graph {
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_step3vl : clip_graph {
|
||||
clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_youtuvl : clip_graph {
|
||||
clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
||||
81
tools/mtmd/models/step3vl.cpp
Normal file
81
tools/mtmd/models/step3vl.cpp
Normal file
@@ -0,0 +1,81 @@
|
||||
#include "models.h"
|
||||
|
||||
// Builds the Step3-VL vision graph: ViT encoder (learned position embeddings
// plus 2D RoPE driven by the "pos_h"/"pos_w" inputs), two stride-2 3x3
// convolutions that downsample the patch grid, and a final linear projection.
ggml_cgraph * clip_graph_step3vl::build() {
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.position_embeddings != nullptr);

    // Creates a named I32 graph input with one entry per patch.
    auto make_pos_input = [&](const char * name) {
        ggml_tensor * t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
        ggml_set_name(t, name);
        ggml_set_input(t);
        return t;
    };

    ggml_tensor * pos_h = make_pos_input("pos_h");
    ggml_tensor * pos_w = make_pos_input("pos_w");

    ggml_tensor * inp              = build_inp();
    ggml_tensor * learned_pos_embd = resize_position_embeddings();

    // Positional hook handed to build_vit(): applies 2D RoPE.
    auto add_pos = [&](ggml_tensor * x, const clip_layer &) {
        return build_rope_2d(ctx0, x, pos_w, pos_h, hparams.rope_theta, false);
    };

    // One downsampling stage: stride-2, padding-1 convolution followed by an
    // optional per-channel bias, then the debug callback under `name`.
    auto downsample = [&](ggml_tensor * x, ggml_tensor * kernel, ggml_tensor * bias, const char * name) {
        x = ggml_conv_2d(ctx0, kernel, x, 2, 2, 1, 1, 1, 1);

        if (bias != nullptr) {
            const int64_t w = x->ne[0];
            const int64_t h = x->ne[1];
            const int64_t c = x->ne[2];

            // Move channels to dim 0 so the bias is added along the channel axis,
            // then restore the [w, h, c] layout.
            x = ggml_reshape_2d(ctx0, x, w * h, c);
            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
            x = ggml_add(ctx0, x, bias);
            x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
            x = ggml_reshape_3d(ctx0, x, w, h, c);
        }

        cb(x, name, -1);
        return x;
    };

    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, learned_pos_embd, add_pos);
    cb(cur, "vit_out", -1);

    // [n_embd, n_patches] -> [w, h, n_embd] for spatial downsampling convolutions.
    cur = ggml_permute(ctx0, cur, 1, 0, 2, 3);
    cur = ggml_cont_3d(ctx0, cur, n_patches_x, n_patches_y, n_embd);

    // First downsampler: Conv2d(1536 -> 3072, k=3, s=2, p=1)
    cur = downsample(cur, model.mm_0_w, model.mm_0_b, "downsample_0");

    // Second downsampler: Conv2d(3072 -> 6144, k=3, s=2, p=1)
    cur = downsample(cur, model.mm_1_w, model.mm_1_b, "downsample_1");

    // [w, h, c] -> [c, w*h]
    const int64_t n_spatial = cur->ne[0] * cur->ne[1];
    cur = ggml_reshape_3d(ctx0, cur, n_spatial, cur->ne[2], cur->ne[3]);
    cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 0, 2, 3));
    cb(cur, "downsample_flatten", -1);

    // Final projector: Linear(6144 -> projection_dim)
    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
    cb(cur, "projector_out", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
@@ -1114,6 +1114,260 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_image_preprocessor_step3vl
|
||||
//
|
||||
|
||||
void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
|
||||
const clip_image_u8 & src,
|
||||
clip_image_f32 & dst,
|
||||
int target_width,
|
||||
int target_height,
|
||||
const float mean[3],
|
||||
const float std[3]) {
|
||||
if (src.nx == target_width && src.ny == target_height) {
|
||||
img_u8_to_f32(src, dst, mean, std);
|
||||
return;
|
||||
}
|
||||
|
||||
dst.nx = target_width;
|
||||
dst.ny = target_height;
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
|
||||
const float scale_x = static_cast<float>(src.nx) / target_width;
|
||||
const float scale_y = static_cast<float>(src.ny) / target_height;
|
||||
|
||||
for (int y = 0; y < target_height; ++y) {
|
||||
const float src_y = (static_cast<float>(y) + 0.5f) * scale_y - 0.5f;
|
||||
const int y0_floor = static_cast<int>(std::floor(src_y));
|
||||
const int y0 = std::max(0, std::min(y0_floor, src.ny - 1));
|
||||
const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1));
|
||||
const float ly = src_y - y0_floor;
|
||||
|
||||
for (int x = 0; x < target_width; ++x) {
|
||||
const float src_x = (static_cast<float>(x) + 0.5f) * scale_x - 0.5f;
|
||||
const int x0_floor = static_cast<int>(std::floor(src_x));
|
||||
const int x0 = std::max(0, std::min(x0_floor, src.nx - 1));
|
||||
const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1));
|
||||
const float lx = src_x - x0_floor;
|
||||
|
||||
const size_t idx00 = 3 * (y0 * src.nx + x0);
|
||||
const size_t idx01 = 3 * (y0 * src.nx + x1);
|
||||
const size_t idx10 = 3 * (y1 * src.nx + x0);
|
||||
const size_t idx11 = 3 * (y1 * src.nx + x1);
|
||||
const size_t idx_dst = 3 * (y * target_width + x);
|
||||
|
||||
for (int c = 0; c < 3; ++c) {
|
||||
const float v00 = (static_cast<float>(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v01 = (static_cast<float>(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v10 = (static_cast<float>(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c];
|
||||
const float v11 = (static_cast<float>(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c];
|
||||
|
||||
const float top = v00 + (v01 - v00) * lx;
|
||||
const float bot = v10 + (v11 - v10) * lx;
|
||||
dst.buf[idx_dst + c] = top + (bot - top) * ly;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
|
||||
return params.image_longest_edge > 0 ? params.image_longest_edge : default_image_longest_edge;
|
||||
}
|
||||
|
||||
int mtmd_image_preprocessor_step3vl::determine_window_size(const clip_hparams & params, int longer, int shorter) {
|
||||
const int image_size = params.image_size;
|
||||
const int crop_size = default_image_crop_size;
|
||||
const float aspect_ratio = static_cast<float>(longer) / shorter;
|
||||
|
||||
if (longer <= image_size) {
|
||||
return aspect_ratio > small_aspect_ratio_limit ? shorter : 0;
|
||||
}
|
||||
|
||||
return aspect_ratio > wide_aspect_ratio_limit ? std::min(shorter, crop_size) : crop_size;
|
||||
}
|
||||
|
||||
int mtmd_image_preprocessor_step3vl::calc_crop_extent(int length, int window_size) {
|
||||
const float ratio = static_cast<float>(length) / window_size;
|
||||
if (ratio < 1.0f) {
|
||||
return length;
|
||||
}
|
||||
|
||||
const float decimal = ratio - std::floor(ratio);
|
||||
const int rounded = decimal > crop_rounding_threshold
|
||||
? static_cast<int>(std::floor(ratio)) + 1
|
||||
: static_cast<int>(std::floor(ratio));
|
||||
return window_size * rounded;
|
||||
}
|
||||
|
||||
// Returns the start offsets of the windows covering [0, length) along one axis.
// Windows are laid out at multiples of window_size; if the last one would run
// past `length`, it is shifted back so that it ends exactly at `length`.
std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int window_size) {
    int count = 1;
    if (length > window_size) {
        count = static_cast<int>(std::ceil(static_cast<float>(length - window_size) / window_size + 1.0f));
    }

    std::vector<int> starts;
    starts.reserve(count);
    for (int i = 0; i < count; ++i) {
        starts.push_back(window_size * i);
    }

    const bool overshoots = count > 1 && starts.back() + window_size > length;
    if (overshoots) {
        starts.back() = length - window_size;
    }

    return starts;
}
|
||||
|
||||
// Returns a copy of `img` prepared for slicing:
//  1. a very thin image (shorter edge < 32 px and aspect ratio beyond
//     wide_aspect_ratio_limit in either direction) is placed at the top-left of
//     a black square whose side is the longer edge;
//  2. the result is then downscaled (bilinear) so its longer edge does not
//     exceed get_image_longest_edge(params).
clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
    clip_image_u8 result = img;

    float aspect = 1.0f;
    if (img.ny > 0) {
        aspect = static_cast<float>(img.nx) / img.ny;
    }

    const bool is_tiny    = std::min(img.nx, img.ny) < 32;
    const bool is_extreme = aspect > wide_aspect_ratio_limit || aspect < 1.0f / wide_aspect_ratio_limit;

    if (is_tiny && is_extreme) {
        const int side = std::max(img.nx, img.ny);

        clip_image_u8 canvas;
        canvas.nx = side;
        canvas.ny = side;
        canvas.buf.resize(3 * side * side);
        img_tool::fill(canvas, {0, 0, 0});
        img_tool::composite(canvas, img, 0, 0);

        result = std::move(canvas);
    }

    const int limit = get_image_longest_edge(params);
    if (std::max(result.nx, result.ny) > limit) {
        const float scale = static_cast<float>(limit) / std::max(result.nx, result.ny);

        // Keep each edge at least 1 px after flooring.
        const int scaled_w = std::max(1, static_cast<int>(std::floor(result.nx * scale)));
        const int scaled_h = std::max(1, static_cast<int>(std::floor(result.ny * scale)));
        const clip_image_size new_size = { scaled_w, scaled_h };

        clip_image_u8 scaled;
        img_tool::resize(result, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
        result = std::move(scaled);
    }

    return result;
}
|
||||
|
||||
// Returns the w x h window of `image` whose top-left corner is at (x, y).
// Any part of the window that lies outside the source image is left black
// (zero-filled).
clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
    clip_image_u8 out;
    out.nx = w;
    out.ny = h;
    out.buf.resize(3 * w * h, 0);

    // Intersection of the requested window with the source image.
    const int left   = std::max(0, x);
    const int top    = std::max(0, y);
    const int right  = std::min(image.nx, x + w);
    const int bottom = std::min(image.ny, y + h);

    if (left >= right || top >= bottom) {
        return out;  // no overlap: the crop is entirely padding
    }

    // Copy the overlapping region one row at a time (3 bytes per pixel).
    const int row_bytes = 3 * (right - left);
    for (int sy = top; sy < bottom; ++sy) {
        const int src_off = 3 * (sy * image.nx + left);
        const int dst_off = 3 * ((sy - y) * w + (left - x));
        std::copy_n(image.buf.begin() + src_off, row_bytes, out.buf.begin() + dst_off);
    }

    return out;
}
|
||||
|
||||
// Computes the slicing plan for an already-prepared image of `prepared_size`:
// the overview size, the size the image is resized to before cropping, the
// grid dimensions, and one square window per grid cell in row-major order.
// When determine_window_size() yields no window, the plan has no slices and
// zeroed refined/grid sizes.
mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step3vl::build_slice_instructions(
        const clip_hparams & params,
        const clip_image_size & prepared_size) {
    slice_instructions plan;
    plan.overview_size = prepared_size;

    const int longer  = std::max(prepared_size.width, prepared_size.height);
    const int shorter = std::min(prepared_size.width, prepared_size.height);
    const int window  = determine_window_size(params, longer, shorter);

    if (window <= 0) {
        plan.refined_size = clip_image_size{0, 0};
        plan.grid_size    = clip_image_size{0, 0};
        return plan;
    }

    const int refined_w = calc_crop_extent(prepared_size.width, window);
    const int refined_h = calc_crop_extent(prepared_size.height, window);
    plan.refined_size = clip_image_size{refined_w, refined_h};

    const std::vector<int> col_starts = calc_grid(refined_w, window);
    const std::vector<int> row_starts = calc_grid(refined_h, window);
    plan.grid_size = clip_image_size{
        static_cast<int>(col_starts.size()),
        static_cast<int>(row_starts.size()),
    };

    // Row-major: all windows of the first row, then the second row, and so on.
    for (size_t r = 0; r < row_starts.size(); ++r) {
        for (size_t c = 0; c < col_starts.size(); ++c) {
            plan.slices.push_back(slice_coordinates{
                /* x    */ col_starts[c],
                /* y    */ row_starts[r],
                /* size */ clip_image_size{window, window},
            });
        }
    }

    return plan;
}
|
||||
|
||||
bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
clip_image_u8 prepared = prepare_image(img, hparams);
|
||||
const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
|
||||
|
||||
clip_image_f32_ptr overview_f32(clip_image_f32_init());
|
||||
img_u8_resize_bilinear_to_f32(
|
||||
prepared,
|
||||
*overview_f32,
|
||||
hparams.image_size,
|
||||
hparams.image_size,
|
||||
hparams.image_mean,
|
||||
hparams.image_std);
|
||||
output.entries.push_back(std::move(overview_f32));
|
||||
|
||||
if (instructions.slices.empty()) {
|
||||
output.grid_x = 0;
|
||||
output.grid_y = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
clip_image_u8 img_for_crop = prepared;
|
||||
if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
|
||||
clip_image_u8 refined;
|
||||
img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
|
||||
img_for_crop = std::move(refined);
|
||||
}
|
||||
|
||||
const int crop_size = default_image_crop_size;
|
||||
for (const auto & slice : instructions.slices) {
|
||||
// If the requested patch extends past the source image, pad the out-of-bounds area with black.
|
||||
clip_image_u8 patch = crop_with_black_padding(img_for_crop, slice.x, slice.y, slice.size.width, slice.size.height);
|
||||
|
||||
clip_image_f32_ptr patch_f32(clip_image_f32_init());
|
||||
img_u8_resize_bilinear_to_f32(
|
||||
patch,
|
||||
*patch_f32,
|
||||
crop_size,
|
||||
crop_size,
|
||||
hparams.image_mean,
|
||||
hparams.image_std);
|
||||
output.entries.push_back(std::move(patch_f32));
|
||||
}
|
||||
|
||||
output.grid_x = instructions.grid_size.width;
|
||||
output.grid_y = instructions.grid_size.height;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_image_preprocessor_youtuvl
|
||||
//
|
||||
|
||||
@@ -144,6 +144,35 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// custom image preprocessing for Step3VL
|
||||
// ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
|
||||
struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_step3vl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
static slice_instructions build_slice_instructions(const clip_hparams & params, const clip_image_size & prepared_size);
|
||||
|
||||
private:
|
||||
static constexpr int default_image_longest_edge = 3024;
|
||||
static constexpr int default_image_crop_size = 504;
|
||||
static constexpr float small_aspect_ratio_limit = 1.5f;
|
||||
static constexpr float wide_aspect_ratio_limit = 4.0f;
|
||||
static constexpr float crop_rounding_threshold = 0.2f;
|
||||
|
||||
void img_u8_resize_bilinear_to_f32(
|
||||
const clip_image_u8 & src,
|
||||
clip_image_f32 & dst,
|
||||
int target_width,
|
||||
int target_height,
|
||||
const float mean[3],
|
||||
const float std[3]);
|
||||
static int get_image_longest_edge(const clip_hparams & params);
|
||||
static int determine_window_size(const clip_hparams & params, int longer, int shorter);
|
||||
static int calc_crop_extent(int length, int window_size);
|
||||
static std::vector<int> calc_grid(int length, int window_size);
|
||||
static clip_image_u8 prepare_image(const clip_image_u8 & img, const clip_hparams & params);
|
||||
static clip_image_u8 crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h);
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
|
||||
@@ -88,6 +88,7 @@ enum mtmd_slice_tmpl {
|
||||
MTMD_SLICE_TMPL_LLAMA4,
|
||||
MTMD_SLICE_TMPL_IDEFICS3,
|
||||
MTMD_SLICE_TMPL_LFM2,
|
||||
MTMD_SLICE_TMPL_STEP3VL,
|
||||
};
|
||||
|
||||
const char * mtmd_default_marker() {
|
||||
@@ -259,7 +260,6 @@ struct mtmd_context {
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
@@ -331,6 +331,22 @@ struct mtmd_context {
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
// Step3 format:
|
||||
// <patch_start> (patch) <patch_end> [<patch_newline>]
|
||||
// ... (all patch rows)
|
||||
// <im_start> (overview) <im_end>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_STEP3VL;
|
||||
tok_ov_img_start = {lookup_token("<im_start>")};
|
||||
tok_ov_img_end = {lookup_token("<im_end>")};
|
||||
tok_sli_img_start = {lookup_token("<patch_start>")};
|
||||
tok_sli_img_end = {lookup_token("<patch_end>")};
|
||||
tok_row_end = {lookup_token("<patch_newline>")};
|
||||
tok_row_end_trail = false;
|
||||
ov_img_first = false; // patches first, overview last
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_step3vl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
@@ -682,6 +698,7 @@ struct mtmd_tokenizer {
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|
||||
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
|
||||
) {
|
||||
const int n_col = batch_f32.grid_x;
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -18,7 +18,7 @@
|
||||
<div style="display: contents">
|
||||
<script>
|
||||
{
|
||||
__sveltekit_10avopp = {
|
||||
__sveltekit_1ppa22i = {
|
||||
base: new URL('.', location).pathname.slice(0, -1)
|
||||
};
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
import remarkGfm from 'remark-gfm';
|
||||
import remarkMath from 'remark-math';
|
||||
import rehypeHighlight from 'rehype-highlight';
|
||||
import { all as lowlightAll } from 'lowlight';
|
||||
import remarkRehype from 'remark-rehype';
|
||||
import rehypeKatex from 'rehype-katex';
|
||||
import rehypeStringify from 'rehype-stringify';
|
||||
@@ -96,6 +97,7 @@
|
||||
|
||||
return proc
|
||||
.use(rehypeHighlight, {
|
||||
languages: lowlightAll,
|
||||
aliases: { [FileTypeText.XML]: [FileTypeText.SVELTE, FileTypeText.VUE] }
|
||||
}) // Add syntax highlighting
|
||||
.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables
|
||||
|
||||
Reference in New Issue
Block a user