mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
4 Commits
b8546
...
gg/attn-ro
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f0fea264b0 | ||
|
|
7711b3a36a | ||
|
|
832e32639f | ||
|
|
e5aa067d68 |
2
.github/workflows/build-android.yml
vendored
2
.github/workflows/build-android.yml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
distribution: zulu
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
log-accepted-android-sdk-licenses: false
|
||||
|
||||
|
||||
2
.github/workflows/build-msys.yml
vendored
2
.github/workflows/build-msys.yml
vendored
@@ -43,7 +43,7 @@ jobs:
|
||||
# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
|
||||
|
||||
- name: Setup ${{ matrix.sys }}
|
||||
uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
|
||||
uses: msys2/setup-msys2@v2
|
||||
with:
|
||||
update: true
|
||||
msystem: ${{matrix.sys}}
|
||||
|
||||
12
.github/workflows/docker.yml
vendored
12
.github/workflows/docker.yml
vendored
@@ -56,15 +56,15 @@ jobs:
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ matrix.config.tag != 's390x' }}
|
||||
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v7.0.0-28
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.repository_owner }}
|
||||
@@ -127,7 +127,7 @@ jobs:
|
||||
|
||||
- name: Build and push Full Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
@@ -152,7 +152,7 @@ jobs:
|
||||
|
||||
- name: Build and push Light Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
@@ -177,7 +177,7 @@ jobs:
|
||||
|
||||
- name: Build and push Server Docker image (tagged + versioned)
|
||||
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
|
||||
2
.github/workflows/editorconfig.yml
vendored
2
.github/workflows/editorconfig.yml
vendored
@@ -23,7 +23,7 @@ jobs:
|
||||
runs-on: ubuntu-slim
|
||||
steps:
|
||||
- uses: actions/checkout@v6
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
|
||||
- uses: editorconfig-checker/action-editorconfig-checker@v2
|
||||
with:
|
||||
version: v3.0.3
|
||||
- run: editorconfig-checker
|
||||
|
||||
2
.github/workflows/gguf-publish.yml
vendored
2
.github/workflows/gguf-publish.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Build package
|
||||
run: cd gguf-py && poetry build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
packages-dir: gguf-py/dist
|
||||
|
||||
2
.github/workflows/python-lint.yml
vendored
2
.github/workflows/python-lint.yml
vendored
@@ -31,6 +31,6 @@ jobs:
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: flake8 Lint
|
||||
uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
|
||||
uses: py-actions/flake8@v2
|
||||
with:
|
||||
plugins: "flake8-no-print"
|
||||
|
||||
@@ -26,8 +26,6 @@ namespace nl = nlohmann;
|
||||
#include <windows.h>
|
||||
#else
|
||||
#define HOME_DIR "HOME"
|
||||
#include <unistd.h>
|
||||
#include <pwd.h>
|
||||
#endif
|
||||
|
||||
namespace hf_cache {
|
||||
@@ -53,13 +51,6 @@ static fs::path get_cache_directory() {
|
||||
return entry.path.empty() ? base : base / entry.path;
|
||||
}
|
||||
}
|
||||
#ifndef _WIN32
|
||||
const struct passwd * pw = getpwuid(getuid());
|
||||
|
||||
if (pw->pw_dir && *pw->pw_dir) {
|
||||
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
|
||||
}
|
||||
#endif
|
||||
throw std::runtime_error("Failed to determine HF cache directory");
|
||||
}();
|
||||
|
||||
|
||||
@@ -486,7 +486,7 @@ class ModelBase:
|
||||
elif quant_method == "modelopt":
|
||||
# Mixed-precision ModelOpt models: NVFP4 tensors are handled by
|
||||
# _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
|
||||
# are dequantized here. k/v scale tensors are unused.
|
||||
# are dequantized here. input_scale tensors are unused.
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".weight_scale"):
|
||||
weight_name = name.removesuffix("_scale")
|
||||
@@ -494,7 +494,7 @@ class ModelBase:
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
|
||||
tensors_to_remove.append(name)
|
||||
if name.endswith((".k_scale", ".v_scale")):
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method is not None:
|
||||
raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
|
||||
@@ -542,6 +542,7 @@ class ModelBase:
|
||||
raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
# Handle gate/up expert tensor fusion if enabled
|
||||
@@ -606,12 +607,7 @@ class ModelBase:
|
||||
def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
|
||||
return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
|
||||
|
||||
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
|
||||
if "language_model." in name:
|
||||
name = name.replace("language_model.", "")
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
|
||||
raw, shape = self._nvfp4_pack(weight, scale)
|
||||
logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
|
||||
self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
|
||||
@@ -623,18 +619,10 @@ class ModelBase:
|
||||
logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
|
||||
self.gguf_writer.add_tensor(scale_name, scale2_f32)
|
||||
|
||||
# Emit per-tensor input_scale as a separate F32 tensor when non-trivial
|
||||
if not self._nvfp4_scale2_is_trivial(input_scale):
|
||||
input_scale_f32 = input_scale.float().numpy().flatten()
|
||||
input_scale_name = new_name.replace(".weight", ".input_scale")
|
||||
logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
|
||||
self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
|
||||
|
||||
def _generate_nvfp4_tensors(self):
|
||||
# Per-layer expert merging to avoid holding all experts in memory
|
||||
expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
|
||||
expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||
expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
|
||||
expert_shapes: dict[tuple[int, str], list[int]] = {}
|
||||
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
|
||||
consumed: list[str] = []
|
||||
@@ -644,7 +632,6 @@ class ModelBase:
|
||||
continue
|
||||
scale_name = name.replace(".weight", ".weight_scale")
|
||||
scale2_name = name.replace(".weight", ".weight_scale_2")
|
||||
input_scale_name = name.replace(".weight", ".input_scale")
|
||||
if scale_name not in self.model_tensors:
|
||||
continue
|
||||
# Force eager materialization of lazy tensors
|
||||
@@ -656,14 +643,11 @@ class ModelBase:
|
||||
continue
|
||||
|
||||
scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
|
||||
input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))())
|
||||
|
||||
# Mark tensors for removal from model_tensors (already written to gguf)
|
||||
consumed.extend([name, scale_name])
|
||||
if scale2_name in self.model_tensors:
|
||||
consumed.append(scale2_name)
|
||||
if input_scale_name in self.model_tensors:
|
||||
consumed.append(input_scale_name)
|
||||
|
||||
# Check if this is a per-expert tensor
|
||||
m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
|
||||
@@ -679,37 +663,34 @@ class ModelBase:
|
||||
if key not in expert_blocks:
|
||||
expert_blocks[key] = []
|
||||
expert_scales[key] = []
|
||||
expert_input_scales[key] = []
|
||||
expert_shapes[key] = shape
|
||||
expert_blocks[key].append((expert_id, raw.copy()))
|
||||
# Collect per-expert scale2 (scalar per expert)
|
||||
expert_scales[key].append((expert_id, float(scale2.float().sum())))
|
||||
# Collect per-expert input_scale (scalar per expert)
|
||||
expert_input_scales[key].append((expert_id, float(input_scale.float().sum())))
|
||||
|
||||
# Flush when all experts for this (layer, proj) are collected
|
||||
if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
|
||||
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
|
||||
self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
else:
|
||||
self._repack_nvfp4(name, weight, scale, scale2, input_scale)
|
||||
new_name = self.map_tensor_name(name)
|
||||
self._repack_nvfp4(new_name, weight, scale, scale2)
|
||||
|
||||
# Flush any remaining experts (fallback if n_experts was unknown)
|
||||
for (bid, proj_type) in list(expert_blocks.keys()):
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
|
||||
self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
|
||||
|
||||
# Remove consumed tensors so get_tensors/modify_tensors won't see them
|
||||
for name in consumed:
|
||||
self.model_tensors.pop(name, None)
|
||||
|
||||
# Remove any remaining unused auxiliary tensors
|
||||
# Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
|
||||
for name in list(self.model_tensors.keys()):
|
||||
if name.endswith((".k_scale", ".v_scale")):
|
||||
if name.endswith((".input_scale", ".k_scale", ".v_scale")):
|
||||
del self.model_tensors[name]
|
||||
|
||||
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type):
|
||||
def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
|
||||
experts = expert_blocks.pop(key)
|
||||
scales = expert_scales.pop(key)
|
||||
input_scales = expert_input_scales.pop(key)
|
||||
shape = expert_shapes.pop(key)
|
||||
|
||||
experts.sort(key=lambda x: x[0])
|
||||
@@ -727,14 +708,6 @@ class ModelBase:
|
||||
logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
|
||||
self.gguf_writer.add_tensor(scale_name, scale_vals)
|
||||
|
||||
# Emit per-expert input_scale tensor if any expert has non-trivial input_scale
|
||||
input_scales.sort(key=lambda x: x[0])
|
||||
input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
|
||||
if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
|
||||
input_scale_name = new_name.replace(".weight", ".input_scale")
|
||||
logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
|
||||
self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
|
||||
|
||||
del experts, merged
|
||||
|
||||
def prepare_tensors(self):
|
||||
@@ -1338,9 +1311,6 @@ class TextModel(ModelBase):
|
||||
if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
|
||||
# ref: https://huggingface.co/aari1995/German_Semantic_V3
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
|
||||
# ref: https://huggingface.co/evilfreelancer/ruGPT3XL
|
||||
res = "gpt-2"
|
||||
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
|
||||
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
|
||||
res = "llama-bpe"
|
||||
@@ -5041,97 +5011,6 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
|
||||
perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
|
||||
return tensor.permute(*perm).contiguous().reshape(*shape)
|
||||
|
||||
def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:
|
||||
if not name.endswith((
|
||||
".linear_attn.in_proj_qkv.weight",
|
||||
".linear_attn.in_proj_z.weight",
|
||||
".linear_attn.in_proj_a.weight",
|
||||
".linear_attn.in_proj_b.weight",
|
||||
".linear_attn.out_proj.weight",
|
||||
)):
|
||||
return weight, scale
|
||||
|
||||
num_k_heads = self.hparams["linear_num_key_heads"]
|
||||
num_v_heads = self.hparams["linear_num_value_heads"]
|
||||
head_k_dim = self.hparams["linear_key_head_dim"]
|
||||
head_v_dim = self.hparams["linear_value_head_dim"]
|
||||
num_v_per_k = num_v_heads // num_k_heads
|
||||
|
||||
def unpack_nibbles(qs: Tensor) -> Tensor:
|
||||
lo = torch.bitwise_and(qs, 0x0F)
|
||||
hi = torch.bitwise_right_shift(qs, 4)
|
||||
return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2)
|
||||
|
||||
def pack_nibbles(codes: Tensor) -> Tensor:
|
||||
codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2)
|
||||
lo = torch.bitwise_and(codes[..., 0], 0x0F)
|
||||
hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4)
|
||||
return torch.bitwise_or(lo, hi).contiguous()
|
||||
|
||||
def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]:
|
||||
assert qs.ndim >= 2
|
||||
assert scales.ndim >= 2
|
||||
|
||||
k = qs.shape[-1] * 2
|
||||
assert col_perm.numel() == k
|
||||
assert k % 16 == 0
|
||||
|
||||
group_cols = col_perm.reshape(-1, 16)
|
||||
group_starts = group_cols[:, 0]
|
||||
expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype)
|
||||
assert torch.equal(group_cols, expected)
|
||||
assert torch.all(group_starts % 16 == 0)
|
||||
|
||||
group_perm = (group_starts // 16).to(dtype=torch.long)
|
||||
expected_groups = torch.arange(scales.shape[-1], dtype=torch.long)
|
||||
assert group_perm.numel() == scales.shape[-1]
|
||||
assert torch.equal(torch.sort(group_perm).values, expected_groups)
|
||||
|
||||
codes = unpack_nibbles(qs)
|
||||
codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long))
|
||||
qs = pack_nibbles(codes)
|
||||
scales = scales.index_select(-1, group_perm.to(device=scales.device))
|
||||
return qs, scales
|
||||
|
||||
def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]:
|
||||
row_perm = self._reorder_v_heads(
|
||||
torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1),
|
||||
0, num_k_heads, num_v_per_k, head_dim,
|
||||
).squeeze(-1)
|
||||
return (
|
||||
qs.index_select(0, row_perm.to(device=qs.device)),
|
||||
scales.index_select(0, row_perm.to(device=scales.device)),
|
||||
)
|
||||
|
||||
if name.endswith(".linear_attn.in_proj_qkv.weight"):
|
||||
q_dim = head_k_dim * num_k_heads
|
||||
k_dim = head_k_dim * num_k_heads
|
||||
q = weight[:q_dim]
|
||||
k = weight[q_dim:q_dim + k_dim]
|
||||
v = weight[q_dim + k_dim:]
|
||||
q_scale = scale[:q_dim]
|
||||
k_scale = scale[q_dim:q_dim + k_dim]
|
||||
v_scale = scale[q_dim + k_dim:]
|
||||
v, v_scale = reorder_rows(v, v_scale, head_v_dim)
|
||||
return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0)
|
||||
|
||||
if name.endswith(".linear_attn.in_proj_z.weight"):
|
||||
weight, scale = reorder_rows(weight, scale, head_v_dim)
|
||||
elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")):
|
||||
weight, scale = reorder_rows(weight, scale, 1)
|
||||
elif name.endswith(".linear_attn.out_proj.weight"):
|
||||
col_perm = self._reorder_v_heads(
|
||||
torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0),
|
||||
1, num_k_heads, num_v_per_k, head_v_dim,
|
||||
).squeeze(0)
|
||||
weight, scale = apply_col_perm(weight, scale, col_perm)
|
||||
|
||||
return weight, scale
|
||||
|
||||
def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
|
||||
weight, scale = self._transform_nvfp4_weight(name, weight, scale)
|
||||
super()._repack_nvfp4(name, weight, scale, scale2, input_scale)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_k_heads = self.hparams.get("linear_num_key_heads", 0)
|
||||
num_v_heads = self.hparams.get("linear_num_value_heads", 0)
|
||||
@@ -5221,47 +5100,6 @@ class GPT2Model(TextModel):
|
||||
yield from super().modify_tensors(data_torch, new_name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("RuGPT3XLForCausalLM")
|
||||
class RuGPT3XLModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.GPT2
|
||||
|
||||
_qkv_parts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# Fuse separate Q, K, V projections into a single QKV tensor
|
||||
if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name:
|
||||
suffix = "weight" if name.endswith(".weight") else "bias"
|
||||
part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v")
|
||||
key = f"{part}.{suffix}"
|
||||
|
||||
assert bid is not None
|
||||
if self._qkv_parts is None:
|
||||
self._qkv_parts = [{} for _ in range(self.block_count)]
|
||||
self._qkv_parts[bid][key] = data_torch
|
||||
|
||||
q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}"
|
||||
if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]):
|
||||
q = self._qkv_parts[bid].pop(q_key)
|
||||
k = self._qkv_parts[bid].pop(k_key)
|
||||
v = self._qkv_parts[bid].pop(v_key)
|
||||
data_torch = torch.cat([q, k, v], dim=0)
|
||||
name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}")
|
||||
logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}")
|
||||
else:
|
||||
return
|
||||
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._qkv_parts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()]
|
||||
if len(parts) > 0:
|
||||
raise ValueError(f"Unprocessed Q/K/V parts: {parts}")
|
||||
|
||||
|
||||
@ModelBase.register("PhiForCausalLM")
|
||||
class Phi2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PHI2
|
||||
@@ -7150,8 +6988,6 @@ class DeepseekOCRVisionModel(MmprojModel):
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if ".rel_pos_h" in name or '.rel_pos_w' in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
if ".neck." in name or ".net_" in name:
|
||||
return gguf.GGMLQuantizationType.F32
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
|
||||
@@ -178,7 +178,6 @@ pre_computed_hashes = [
|
||||
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
|
||||
# jina-v2-de variants
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
|
||||
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -802,13 +802,7 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
|
||||
#ifdef FP8_AVAILABLE
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3)
|
||||
// ROCm does not support fp8 in software on devices with fp8 hardware,
|
||||
// but CDNA3 supports only e4m3_fnuz (no inf).
|
||||
const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
|
||||
#else
|
||||
const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA3)
|
||||
return static_cast<float>(xf) / 2;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
|
||||
@@ -394,9 +394,6 @@ struct ggml_backend_opencl_context {
|
||||
bool fp16_support;
|
||||
bool has_vector_subgroup_broadcast;
|
||||
bool disable_fusion;
|
||||
|
||||
bool adreno_has_large_buffer;
|
||||
bool adreno_use_large_buffer;
|
||||
ggml_cl_compiler_version adreno_cl_compiler_version;
|
||||
|
||||
int adreno_wave_size;
|
||||
@@ -790,10 +787,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
||||
" -cl-finite-math-only -cl-fast-relaxed-math";
|
||||
|
||||
if (backend_ctx->adreno_use_large_buffer) {
|
||||
compile_opts += " -qcom-enable-large-buffer ";
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
|
||||
|
||||
// add
|
||||
@@ -3027,8 +3020,6 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
|
||||
// check Adreno large buffer support
|
||||
backend_ctx->adreno_has_large_buffer = strstr(ext_buffer, "cl_qcom_large_buffer") != NULL;
|
||||
|
||||
// fp16 is required
|
||||
if (!backend_ctx->fp16_support) {
|
||||
@@ -3095,18 +3086,6 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
||||
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
|
||||
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
||||
|
||||
// determine whether to use large buffer for Adreno
|
||||
backend_ctx->adreno_use_large_buffer = getenv("GGML_OPENCL_ADRENO_USE_LARGE_BUFFER") != nullptr &&
|
||||
backend_ctx->gpu_family == GPU_FAMILY::ADRENO;
|
||||
if (backend_ctx->adreno_use_large_buffer) {
|
||||
if (!backend_ctx->adreno_has_large_buffer) {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno large buffer requested but not supported by driver, will use regular buffer\n");
|
||||
backend_ctx->adreno_use_large_buffer = false;
|
||||
} else {
|
||||
GGML_LOG_INFO("ggml_opencl: Adreno large buffer enabled\n");
|
||||
}
|
||||
}
|
||||
|
||||
cl_int err;
|
||||
|
||||
// A local ref of cl_context for convenience
|
||||
@@ -5681,11 +5660,6 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
|
||||
|
||||
cl_int err;
|
||||
cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
if (err != CL_SUCCESS && backend_ctx->adreno_use_large_buffer) {
|
||||
cl_mem_properties props[] = { 0x41A6 /* CL_LARGE_BUFFER_QCOM */, 1, 0 };
|
||||
mem = clCreateBufferWithProperties(backend_ctx->context, props, CL_MEM_READ_WRITE, size, NULL, &err);
|
||||
}
|
||||
|
||||
if (err != CL_SUCCESS) {
|
||||
GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
|
||||
return nullptr;
|
||||
|
||||
@@ -63,7 +63,6 @@ class TensorNameMap:
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
"wpe", # gpt2
|
||||
"model.embed_positions", # rugpt3xl
|
||||
),
|
||||
|
||||
# Output
|
||||
|
||||
@@ -52,6 +52,59 @@ static bool can_reuse_kq_mask(
|
||||
|
||||
// impl
|
||||
|
||||
// Returns true iff n is a positive power of two (1, 2, 4, ...).
// Zero and negative values are rejected; the n > 0 test also short-circuits
// before n - 1 could overflow for INT_MIN.
static bool ggml_is_power_of_2(int n) {
    return n > 0 && (n & (n - 1)) == 0;
}

// orthonormal Walsh-Hadamard rotation matrix
//
// Fills `data` with H consecutive n x n float matrices (H*n*n floats in total):
//   - matrix 0 is built by repeated doubling ([[A, A], [A, -A]]) starting from the
//     single entry 1/sqrt(n), i.e. a Hadamard matrix scaled to be orthonormal
//   - matrix h > 0 is a copy of matrix 0 in which all entries sharing the same
//     fastest-varying index i are multiplied by one pseudo-random sign
//
// n must be a power of two (checked with assert). If H <= 1 only matrix 0 is written.
//
// The signs come from a private generator with a fixed seed, so the result is the same
// on every call and on every platform, and the process-wide rand()/srand() state is
// not touched.
static void set_input_hadamard(float * data, int n, int H) {
    assert(ggml_is_power_of_2(n));

    data[0] = 1.0 / sqrtf(n);

    // each pass doubles the side of the already filled top-left s x s square
    for (int s = 1; s < n; s *= 2) {
        for (int i = 0; i < s; i++) {
            for (int j = 0; j < s; j++) {
                const float val = data[i*n + j];

                data[(i + s)*n + (j    )] =  val;
                data[(i    )*n + (j + s)] =  val;
                data[(i + s)*n + (j + s)] = -val;
            }
        }
    }

    // local linear congruential generator - the seed is fixed so that the sign pattern is reproducible
    uint32_t rng_state = 1242;

    // derive the matrices of the remaining heads from matrix 0
    for (int h = 1; h < H; h++) {
        float * dst = data + (size_t) h*n*n;

        for (int i = 0; i < n; i++) {
            rng_state = rng_state*1664525u + 1013904223u;

            // use the top bit - the low bits of an LCG have a very short period
            const float sgn = (rng_state >> 31) ? 1.0f : -1.0f;

            for (int j = 0; j < n; j++) {
                dst[j*n + i] = sgn*data[j*n + i];
            }
        }
    }
}
|
||||
|
||||
// Rotates `cur` by `rot` in chunks of rot->ne[0] elements along dimension 0.
//
// `cur` is viewed as [n, cur->ne[0]/n, cur->ne[1], cur->ne[2]] with n = rot->ne[0],
// multiplied by `rot` via ggml_mul_mat, and the product is reshaped back to the
// original 4D shape of `cur`.
//
// NOTE(review): the first reshape does not carry cur->ne[3], so this presumably expects
// cur->ne[3] == 1 and cur->ne[0] to be a multiple of rot->ne[0] - confirm against callers.
static ggml_tensor * ggml_rotate_hadamard(
        ggml_context * ctx,
        ggml_tensor  * cur,
        ggml_tensor  * rot) {
    const auto chunk = rot->ne[0];

    // split dim 0 into chunks that match the side of the rotation matrix
    ggml_tensor * chunks  = ggml_reshape_4d(ctx, cur, chunk, cur->ne[0]/(chunk), cur->ne[1], cur->ne[2]);
    ggml_tensor * rotated = ggml_mul_mat(ctx, rot, chunks);

    // restore the shape of the input
    return ggml_reshape_4d(ctx, rotated, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
}
|
||||
|
||||
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->token) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -429,6 +482,22 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
|
||||
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
|
||||
if (self_rotk) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotk->buffer));
|
||||
|
||||
float * data = (float *) self_rotk->data;
|
||||
|
||||
set_input_hadamard(data, self_rotk->ne[0], self_rotk->ne[2]);
|
||||
}
|
||||
|
||||
if (self_rotv) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotv->buffer));
|
||||
|
||||
float * data = (float *) self_rotv->data;
|
||||
|
||||
set_input_hadamard(data, self_rotv->ne[0], self_rotv->ne[2]);
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
||||
@@ -476,6 +545,22 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
|
||||
if (self_rotk) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotk->buffer));
|
||||
|
||||
float * data = (float *) self_rotk->data;
|
||||
|
||||
set_input_hadamard(data, self_rotk->ne[0], self_rotk->ne[2]);
|
||||
}
|
||||
|
||||
if (self_rotv) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(self_rotv->buffer));
|
||||
|
||||
float * data = (float *) self_rotv->data;
|
||||
|
||||
set_input_hadamard(data, self_rotv->ne[0], self_rotv->ne[2]);
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
||||
@@ -532,6 +617,22 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
||||
|
||||
if (inp_attn->self_rotk) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotk->buffer));
|
||||
|
||||
float * data = (float *) inp_attn->self_rotk->data;
|
||||
|
||||
set_input_hadamard(data, inp_attn->self_rotk->ne[0], inp_attn->self_rotk->ne[2]);
|
||||
}
|
||||
|
||||
if (inp_attn->self_rotv) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotv->buffer));
|
||||
|
||||
float * data = (float *) inp_attn->self_rotv->data;
|
||||
|
||||
set_input_hadamard(data, inp_attn->self_rotv->ne[0], inp_attn->self_rotv->ne[2]);
|
||||
}
|
||||
|
||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||
|
||||
if (inp_rs->s_copy) {
|
||||
@@ -630,6 +731,22 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
if (inp_attn->self_rotk) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotk->buffer));
|
||||
|
||||
float * data = (float *) inp_attn->self_rotk->data;
|
||||
|
||||
set_input_hadamard(data, inp_attn->self_rotk->ne[0], inp_attn->self_rotk->ne[2]);
|
||||
}
|
||||
|
||||
if (inp_attn->self_rotv) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(inp_attn->self_rotv->buffer));
|
||||
|
||||
float * data = (float *) inp_attn->self_rotv->data;
|
||||
|
||||
set_input_hadamard(data, inp_attn->self_rotv->ne[0], inp_attn->self_rotv->ne[2]);
|
||||
}
|
||||
|
||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||
|
||||
if (inp_rs->s_copy) {
|
||||
@@ -2003,12 +2120,52 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
}
|
||||
|
||||
{
|
||||
const bool can_rotk =
|
||||
!hparams.is_n_embd_k_gqa_variable() &&
|
||||
hparams.n_embd_head_k() % 64 == 0 &&
|
||||
ggml_is_quantized(mctx_cur->type_k());
|
||||
|
||||
if (can_rotk) {
|
||||
int nrot = 64;
|
||||
//do {
|
||||
// nrot *= 2;
|
||||
//} while (hparams.n_embd_head_k() % nrot == 0);
|
||||
//nrot /= 2;
|
||||
|
||||
inp->self_rotk = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, nrot, nrot, hparams.n_head_kv());
|
||||
ggml_set_input(inp->self_rotk);
|
||||
} else {
|
||||
inp->self_rotk = nullptr;
|
||||
}
|
||||
|
||||
const bool can_rotv =
|
||||
!hparams.is_n_embd_v_gqa_variable() &&
|
||||
hparams.n_embd_head_v() % 64 == 0 &&
|
||||
ggml_is_quantized(mctx_cur->type_v());
|
||||
|
||||
if (can_rotv) {
|
||||
int nrot = 64;
|
||||
|
||||
// TODO: I think we can afford to rotate the V more compared to Q and K - to be confirmed
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
|
||||
//do {
|
||||
// nrot *= 2;
|
||||
//} while (hparams.n_embd_head_v() % nrot == 0);
|
||||
//nrot /= 2;
|
||||
|
||||
inp->self_rotv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
|
||||
ggml_set_input(inp->self_rotv);
|
||||
} else {
|
||||
inp->self_rotv = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
return inp;
|
||||
}
|
||||
|
||||
@@ -2034,6 +2191,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
int il) const {
|
||||
GGML_ASSERT(v_mla == nullptr);
|
||||
|
||||
if (inp->self_rotk) {
|
||||
q_cur = ggml_rotate_hadamard(ctx0, q_cur, inp->self_rotk);
|
||||
k_cur = ggml_rotate_hadamard(ctx0, k_cur, inp->self_rotk);
|
||||
}
|
||||
|
||||
if (inp->self_rotv) {
|
||||
v_cur = ggml_rotate_hadamard(ctx0, v_cur, inp->self_rotv);
|
||||
}
|
||||
|
||||
// these nodes are added to the graph together so that they are not reordered
|
||||
// by doing so, the number of splits in the graph is reduced
|
||||
// expand k later to enable rope fusion which directly writes into k-v cache
|
||||
@@ -2061,6 +2227,10 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (inp->self_rotv) {
|
||||
cur = ggml_rotate_hadamard(ctx0, cur, inp->self_rotv);
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||
@@ -2171,6 +2341,18 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * v_mla,
|
||||
float kq_scale,
|
||||
int il) const {
|
||||
if (inp->self_rotk) {
|
||||
q_cur = ggml_rotate_hadamard(ctx0, q_cur, inp->self_rotk);
|
||||
if (k_cur) {
|
||||
k_cur = ggml_rotate_hadamard(ctx0, k_cur, inp->self_rotk);
|
||||
}
|
||||
}
|
||||
if (inp->self_rotv) {
|
||||
if (v_cur) {
|
||||
v_cur = ggml_rotate_hadamard(ctx0, v_cur, inp->self_rotv);
|
||||
}
|
||||
}
|
||||
|
||||
// these nodes are added to the graph together so that they are not reordered
|
||||
// by doing so, the number of splits in the graph is reduced
|
||||
ggml_build_forward_expand(gf, q_cur);
|
||||
@@ -2211,6 +2393,10 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (inp->self_rotv) {
|
||||
cur = ggml_rotate_hadamard(ctx0, cur, inp->self_rotv);
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
}
|
||||
@@ -2315,6 +2501,48 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
|
||||
}
|
||||
|
||||
{
|
||||
const bool can_rotk =
|
||||
!hparams.is_n_embd_k_gqa_variable() &&
|
||||
hparams.n_embd_head_k() % 64 == 0 &&
|
||||
ggml_is_quantized(mctx_cur->get_base()->type_k());
|
||||
|
||||
if (can_rotk) {
|
||||
int nrot = 64;
|
||||
//do {
|
||||
// nrot *= 2;
|
||||
//} while (hparams.n_embd_head_k() % nrot == 0);
|
||||
//nrot /= 2;
|
||||
|
||||
inp->self_rotk = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, nrot, nrot, hparams.n_head_kv());
|
||||
ggml_set_input(inp->self_rotk);
|
||||
} else {
|
||||
inp->self_rotk = nullptr;
|
||||
}
|
||||
|
||||
const bool can_rotv =
|
||||
!hparams.is_n_embd_v_gqa_variable() &&
|
||||
hparams.n_embd_head_v() % 64 == 0 &&
|
||||
ggml_is_quantized(mctx_cur->get_base()->type_v());
|
||||
|
||||
if (can_rotv) {
|
||||
int nrot = 64;
|
||||
|
||||
// TODO: I think we can afford to rotate the V more compared to Q and K - to be confirmed
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
|
||||
//do {
|
||||
// nrot *= 2;
|
||||
//} while (hparams.n_embd_head_v() % nrot == 0);
|
||||
//nrot /= 2;
|
||||
|
||||
inp->self_rotv = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, nrot, nrot);
|
||||
ggml_set_input(inp->self_rotv);
|
||||
} else {
|
||||
inp->self_rotv = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
|
||||
@@ -308,6 +308,9 @@ public:
|
||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
|
||||
ggml_tensor * self_rotk = nullptr;
|
||||
ggml_tensor * self_rotv = nullptr;
|
||||
|
||||
// note: these have to be copies because in order to be able to reuse a graph, its inputs
|
||||
// need to carry these parameters with them. otherwise, they can point to freed
|
||||
// llm_graph_params from a previous batch, causing stack-use-after-return
|
||||
@@ -384,6 +387,9 @@ public:
|
||||
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
|
||||
ggml_tensor * self_rotk = nullptr;
|
||||
ggml_tensor * self_rotv = nullptr;
|
||||
|
||||
const llama_hparams hparams;
|
||||
const llama_cparams cparams;
|
||||
|
||||
|
||||
@@ -1004,6 +1004,14 @@ bool llama_kv_cache::get_has_shift() const {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the ggml type of the K tensor stored in the first cache layer.
// Only layers[0] is inspected; no other layer is consulted here.
ggml_type llama_kv_cache::type_k() const {
    const auto & first_layer = layers[0];

    return first_layer.k->type;
}
|
||||
|
||||
// Returns the ggml type of the V tensor stored in the first cache layer.
// Only layers[0] is inspected; no other layer is consulted here.
ggml_type llama_kv_cache::type_v() const {
    const auto & first_layer = layers[0];

    return first_layer.v->type;
}
|
||||
|
||||
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
|
||||
uint32_t result = 0;
|
||||
|
||||
@@ -2239,6 +2247,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
|
||||
return n_kv;
|
||||
}
|
||||
|
||||
// Forwards to type_k() of the object referenced by kv and returns its result unchanged.
ggml_type llama_kv_cache_context::type_k() const {
    const ggml_type k_type = kv->type_k();

    return k_type;
}
|
||||
|
||||
// Forwards to type_v() of the object referenced by kv and returns its result unchanged.
ggml_type llama_kv_cache_context::type_v() const {
    const ggml_type v_type = kv->type_v();

    return v_type;
}
|
||||
|
||||
// Returns whatever kv->get_k() produces for layer il, passing along this
// context's n_kv and the slot info currently selected by i_cur.
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
    const auto & cur_sinfo = sinfos[i_cur];

    return kv->get_k(ctx, il, n_kv, cur_sinfo);
}
|
||||
|
||||
@@ -152,6 +152,9 @@ public:
|
||||
|
||||
bool get_has_shift() const;
|
||||
|
||||
ggml_type type_k() const;
|
||||
ggml_type type_v() const;
|
||||
|
||||
//
|
||||
// graph_build API
|
||||
//
|
||||
@@ -328,6 +331,9 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
|
||||
ggml_type type_k() const;
|
||||
ggml_type type_v() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||
|
||||
@@ -7578,65 +7578,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
if (!layer.ssm_beta_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// input scales
|
||||
if (!layer.wq_in_s && layer.wq) {
|
||||
layer.wq_in_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wk_in_s && layer.wk) {
|
||||
layer.wk_in_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wv_in_s && layer.wv) {
|
||||
layer.wv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wo_in_s && layer.wo) {
|
||||
layer.wo_in_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wqkv_in_s && layer.wqkv) {
|
||||
layer.wqkv_in_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.wqkv_gate_in_s && layer.wqkv_gate) {
|
||||
layer.wqkv_gate_in_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_in_s && layer.ffn_gate) {
|
||||
layer.ffn_gate_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_in_s && layer.ffn_down) {
|
||||
layer.ffn_down_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_in_s && layer.ffn_up) {
|
||||
layer.ffn_up_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_exps_in_s && layer.ffn_gate_exps) {
|
||||
layer.ffn_gate_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_exps_in_s && layer.ffn_down_exps) {
|
||||
layer.ffn_down_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_exps_in_s && layer.ffn_up_exps) {
|
||||
layer.ffn_up_exps_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "input_scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_gate_shexp_in_s && layer.ffn_gate_shexp) {
|
||||
layer.ffn_gate_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_down_shexp_in_s && layer.ffn_down_shexp) {
|
||||
layer.ffn_down_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ffn_up_shexp_in_s && layer.ffn_up_shexp) {
|
||||
layer.ffn_up_shexp_in_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_in_in_s && layer.ssm_in) {
|
||||
layer.ssm_in_in_s = create_tensor(tn(LLM_TENSOR_SSM_IN, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_out_in_s && layer.ssm_out) {
|
||||
layer.ssm_out_in_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_alpha_in_s && layer.ssm_alpha) {
|
||||
layer.ssm_alpha_in_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.ssm_beta_in_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -414,27 +414,6 @@ struct llama_layer {
|
||||
struct ggml_tensor * ssm_alpha_s = nullptr;
|
||||
struct ggml_tensor * ssm_beta_s = nullptr;
|
||||
|
||||
// input scales
|
||||
struct ggml_tensor * wq_in_s = nullptr;
|
||||
struct ggml_tensor * wk_in_s = nullptr;
|
||||
struct ggml_tensor * wv_in_s = nullptr;
|
||||
struct ggml_tensor * wo_in_s = nullptr;
|
||||
struct ggml_tensor * wqkv_in_s = nullptr;
|
||||
struct ggml_tensor * wqkv_gate_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_exps_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_exps_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_up_exps_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_gate_shexp_in_s= nullptr;
|
||||
struct ggml_tensor * ffn_up_shexp_in_s = nullptr;
|
||||
struct ggml_tensor * ffn_down_shexp_in_s= nullptr;
|
||||
struct ggml_tensor * ssm_in_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_out_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_alpha_in_s = nullptr;
|
||||
struct ggml_tensor * ssm_beta_in_s = nullptr;
|
||||
|
||||
// altup & laurel
|
||||
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
||||
struct ggml_tensor * per_layer_proj = nullptr;
|
||||
|
||||
@@ -345,12 +345,9 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
|
||||
|
||||
// do not quantize specific multimodal tensors
|
||||
quantize &= name.find(".position_embd") == std::string::npos;
|
||||
quantize &= name.find("sam.patch_embd") == std::string::npos;
|
||||
quantize &= name.find("sam.pos_embd") == std::string::npos;
|
||||
quantize &= name.find("sam.neck.") == std::string::npos;
|
||||
quantize &= name.find("sam.net_") == std::string::npos;
|
||||
quantize &= name.find(".rel_pos") == std::string::npos;
|
||||
quantize &= name.find(".patch_embd") == std::string::npos;
|
||||
quantize &= name.find(".patch_merger") == std::string::npos;
|
||||
|
||||
return quantize;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ find_package(Threads REQUIRED)
|
||||
add_library(mtmd
|
||||
mtmd.cpp
|
||||
mtmd-audio.cpp
|
||||
mtmd-image.cpp
|
||||
mtmd.h
|
||||
mtmd-helper.cpp
|
||||
mtmd-helper.h
|
||||
|
||||
@@ -51,6 +51,7 @@
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
|
||||
@@ -28,13 +28,6 @@ enum patch_merge_type {
|
||||
PATCH_MERGE_SPATIAL_UNPAD,
|
||||
};
|
||||
|
||||
// identifiers for the available image resize algorithms
// (values are spelled out; they match the implicit numbering 0, 1, 2)
enum resize_algo {
    RESIZE_ALGO_BILINEAR       = 0, // stretch to target resolution
    RESIZE_ALGO_BICUBIC        = 1, // center-crop when aspect ratio doesn't match
    RESIZE_ALGO_BICUBIC_PILLOW = 2,
    // RESIZE_ALGO_LANCZOS, // TODO
};
|
||||
|
||||
struct clip_hparams {
|
||||
int32_t image_size = 0;
|
||||
int32_t patch_size = 0;
|
||||
@@ -44,26 +37,13 @@ struct clip_hparams {
|
||||
int32_t n_head = 0;
|
||||
int32_t n_layer = 0;
|
||||
// idefics3
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
// for preprocessor
|
||||
int32_t image_longest_edge = 0;
|
||||
int32_t image_min_pixels = -1;
|
||||
int32_t image_max_pixels = -1;
|
||||
resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
|
||||
bool image_resize_pad = true; // if false, center-crop will be applied when resizing
|
||||
std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
|
||||
int32_t n_merge = 0; // number of patch merges **per-side**
|
||||
|
||||
// (preprocessor) for llava-uhd style models
|
||||
std::vector<clip_image_size> image_res_candidates;
|
||||
int32_t preproc_min_tiles = 0;
|
||||
int32_t preproc_max_tiles = 0;
|
||||
resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
|
||||
resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
|
||||
bool image_pad_rf = true; // if true, refined image will be padded (e.g. llava-1.6)
|
||||
bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
|
||||
std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
|
||||
std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
@@ -80,6 +60,8 @@ struct clip_hparams {
|
||||
float eps = 1e-6;
|
||||
float rope_theta = 0.0;
|
||||
|
||||
std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
|
||||
int32_t image_crop_resolution;
|
||||
std::unordered_set<int32_t> vision_feature_layer;
|
||||
int32_t attn_window_size = 0;
|
||||
int32_t n_wa_pattern = 0;
|
||||
|
||||
1450
tools/mtmd/clip.cpp
1450
tools/mtmd/clip.cpp
File diff suppressed because it is too large
Load Diff
@@ -97,6 +97,9 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
|
||||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,150 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "clip-model.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
// base class, models must inherit from this class
|
||||
struct mtmd_image_preprocessor {
|
||||
const clip_hparams & hparams;
|
||||
|
||||
mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
|
||||
|
||||
virtual ~mtmd_image_preprocessor() = default;
|
||||
virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
|
||||
|
||||
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
|
||||
void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
|
||||
};
|
||||
|
||||
/**
|
||||
* implementation of LLaVA-UHD:
|
||||
* - https://arxiv.org/pdf/2403.11703
|
||||
* - https://github.com/thunlp/LLaVA-UHD
|
||||
* - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||
*
|
||||
* overview:
|
||||
* - an image always has a single overview (downscaled image)
|
||||
* - an image can have 0 or multiple slices, depending on the image size
|
||||
* - each slice can then be considered as a separate image
|
||||
*
|
||||
* note: the terms "slice" and "tile" are used interchangeably
|
||||
*
|
||||
* for example:
|
||||
*
|
||||
* [overview] --> [slice 1] --> [slice 2]
|
||||
* | |
|
||||
* +--> [slice 3] --> [slice 4]
|
||||
*/
|
||||
struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
|
||||
struct slice_coordinates {
|
||||
int x;
|
||||
int y;
|
||||
clip_image_size size;
|
||||
};
|
||||
|
||||
struct slice_instructions {
|
||||
clip_image_size overview_size; // size of downscaled image
|
||||
clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size)
|
||||
clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices
|
||||
std::vector<slice_coordinates> slices;
|
||||
};
|
||||
|
||||
// LFM2 override this function to implement its custom slicing logic
|
||||
virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
|
||||
|
||||
std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
|
||||
|
||||
private:
|
||||
clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
|
||||
clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
|
||||
|
||||
/**
|
||||
* Selects the best resolution from a list of possible resolutions based on the original size.
|
||||
*
|
||||
* For example, when given a list of resolutions:
|
||||
* - 100x100
|
||||
* - 200x100
|
||||
* - 100x200
|
||||
* - 200x200
|
||||
*
|
||||
* And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
|
||||
*
|
||||
* @param original_size The original size of the image
|
||||
* @param possible_resolutions A list of possible resolutions
|
||||
* @return The best fit resolution
|
||||
*/
|
||||
clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
|
||||
int ensure_divide(int length, int patch_size);
|
||||
clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
|
||||
clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
|
||||
};
|
||||
|
||||
// downscale or upscale the input image to fixed size
|
||||
struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
|
||||
// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_pad
|
||||
// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
|
||||
struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio
|
||||
struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// custom llava-uhd slicing logic for LFM2
|
||||
// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
|
||||
struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
|
||||
// ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
|
||||
static constexpr int min_tiles = 2;
|
||||
static constexpr int max_tiles = 10;
|
||||
static constexpr float max_pixels_tolerance = 2.0f;
|
||||
static constexpr int tile_size = 512;
|
||||
|
||||
using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
|
||||
slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
|
||||
|
||||
private:
|
||||
clip_image_size find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width, int height);
|
||||
std::vector<clip_image_size> get_target_ratios();
|
||||
clip_image_size get_grid_layout(int height, int width);
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
|
||||
mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
|
||||
mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
@@ -2,7 +2,6 @@
|
||||
#include "clip-impl.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-audio.h"
|
||||
#include "mtmd-image.h"
|
||||
#include "debug/mtmd-debug.h"
|
||||
|
||||
#include "llama.h"
|
||||
@@ -139,7 +138,7 @@ struct mtmd_context {
|
||||
|
||||
// for llava-uhd style models, we need special tokens in-between slices
|
||||
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
|
||||
std::vector<llama_token> tok_ov_img_start; // overview image
|
||||
std::vector<llama_token> tok_ov_img_end; // overview image
|
||||
std::vector<llama_token> tok_slices_start; // start of all slices
|
||||
@@ -148,14 +147,13 @@ struct mtmd_context {
|
||||
std::vector<llama_token> tok_sli_img_end; // single slice end
|
||||
std::vector<llama_token> tok_sli_img_mid; // between 2 slices
|
||||
std::vector<llama_token> tok_row_end; // end of row
|
||||
bool tok_row_end_trail = false;
|
||||
bool ov_img_first = false;
|
||||
bool tok_row_end_trail = false;
|
||||
bool ov_img_first = false;
|
||||
|
||||
// string template for slice image delimiters with row/col (idefics3)
|
||||
std::string sli_img_start_tmpl;
|
||||
|
||||
std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
|
||||
std::unique_ptr<mtmd_image_preprocessor> image_preproc;
|
||||
|
||||
// TODO @ngxson : add timings
|
||||
|
||||
@@ -223,193 +221,123 @@ struct mtmd_context {
|
||||
|
||||
void init_vision() {
|
||||
GGML_ASSERT(ctx_v != nullptr);
|
||||
image_preproc.reset();
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_v);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_slices_start = {lookup_token("<slice>")};
|
||||
tok_slices_end = {lookup_token("</slice>")};
|
||||
tok_sli_img_start = tok_ov_img_start;
|
||||
tok_sli_img_end = tok_ov_img_end;
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
switch (proj) {
|
||||
case PROJECTOR_TYPE_MLP:
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
case PROJECTOR_TYPE_LDPV2:
|
||||
case PROJECTOR_TYPE_COGVLM:
|
||||
case PROJECTOR_TYPE_JANUS_PRO:
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
{
|
||||
bool has_pinpoints = !clip_get_hparams(ctx_v)->image_res_candidates.empty();
|
||||
if (has_pinpoints) {
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} else {
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_slices_start = {lookup_token("<slice>")};
|
||||
tok_slices_end = {lookup_token("</slice>")};
|
||||
tok_sli_img_start = tok_ov_img_start;
|
||||
tok_sli_img_end = tok_ov_img_end;
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_sli_img_start = {lookup_token("<slice>")};
|
||||
tok_sli_img_end = {lookup_token("</slice>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
tok_ov_img_start = {lookup_token("<image>")};
|
||||
tok_ov_img_end = {lookup_token("</image>")};
|
||||
tok_sli_img_start = {lookup_token("<slice>")};
|
||||
tok_sli_img_end = {lookup_token("</slice>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version != 0) {
|
||||
throw std::runtime_error(string_format("unsupported minicpmv version: %d\n", minicpmv_version));
|
||||
}
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
case PROJECTOR_TYPE_QWEN3VL:
|
||||
{
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_YOUTUVL:
|
||||
{
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
{
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
{
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
||||
tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
||||
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_idefics3>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
{
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PHI4:
|
||||
{
|
||||
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
// (more details in mtmd_context constructor)
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
// <|media_start|> ... (image embeddings) ... <|media_end|>
|
||||
img_beg = "<|media_start|>";
|
||||
img_end = "<|media_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIK25:
|
||||
{
|
||||
// <|media_begin|> ... (image embeddings) ... <|media_end|>
|
||||
img_beg = "<|media_begin|>";
|
||||
img_end = "<|media_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
{
|
||||
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
||||
img_beg = "<|im_start|>";
|
||||
img_end = "<|im_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||
{
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
{
|
||||
// multi-tile:
|
||||
// <|image_start|>
|
||||
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
|
||||
// <|img_thumbnail|> (thumbnail)
|
||||
// <|image_end|>
|
||||
// single-tile:
|
||||
// <|image_start|> (image) <|image_end|>
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
|
||||
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
|
||||
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
|
||||
ov_img_first = false;
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_lfm2>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GLM4V:
|
||||
{
|
||||
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
|
||||
img_beg = "<|begin_of_image|>";
|
||||
img_end = "<|end_of_image|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
{
|
||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||
img_beg = "<|IMAGE_START|>";
|
||||
img_end = "<|IMAGE_END|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
|
||||
} else if (minicpmv_version != 0) {
|
||||
GGML_ASSERT(false && "unsupported minicpmv version");
|
||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
// llama 4 format:
|
||||
// <|image_start|>
|
||||
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
||||
// (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
|
||||
// ... <|tile_y_separator|> <-- trailing end-of-row token
|
||||
// <|image|> (overview) <-- overview image is last
|
||||
// <|image_end|>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
|
||||
tok_ov_img_start = {lookup_token("<|image|>")};
|
||||
tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
|
||||
tok_row_end = {lookup_token("<|tile_y_separator|>")};
|
||||
tok_row_end_trail = true; // add trailing end-of-row token
|
||||
ov_img_first = false; // overview image is last
|
||||
}
|
||||
|
||||
GGML_ASSERT(image_preproc != nullptr);
|
||||
// set boi/eoi
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
|
||||
tok_ov_img_start = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
|
||||
tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PHI4) {
|
||||
// Phi-4 uses media marker insertion only. Keep image boundary text empty.
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
// (more details in mtmd_context constructor)
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
|
||||
// <|im_start|> ... (image embeddings) ... <|im_end|>
|
||||
img_beg = "<|im_start|>";
|
||||
img_end = "<|im_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LFM2) {
|
||||
// multi-tile:
|
||||
// <|image_start|>
|
||||
// <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
|
||||
// <|img_thumbnail|> (thumbnail)
|
||||
// <|image_end|>
|
||||
// single-tile:
|
||||
// <|image_start|> (image) <|image_end|>
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
slice_tmpl = MTMD_SLICE_TMPL_LFM2;
|
||||
sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
|
||||
tok_ov_img_start = {lookup_token("<|img_thumbnail|>")};
|
||||
ov_img_first = false;
|
||||
} else if (proj == PROJECTOR_TYPE_GLM4V) {
|
||||
img_beg = "<|begin_of_image|>";
|
||||
img_end = "<|end_of_image|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
|
||||
// <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
|
||||
img_beg = "<|IMAGE_START|>";
|
||||
img_end = "<|IMAGE_END|>";
|
||||
}
|
||||
}
|
||||
|
||||
void init_audio() {
|
||||
GGML_ASSERT(ctx_a != nullptr);
|
||||
audio_preproc.reset();
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_a);
|
||||
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
@@ -419,40 +347,36 @@ struct mtmd_context {
|
||||
switch (proj) {
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_QWEN25O:
|
||||
{
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||
aud_beg = "[BEGIN_AUDIO]";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
{
|
||||
// <sound> ... (embeddings) ...
|
||||
aud_beg = "<sound>";
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_GLMA:
|
||||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
|
||||
break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||
} break;
|
||||
audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
|
||||
GGML_ABORT("unsupported audio projector type");
|
||||
}
|
||||
|
||||
// initialize audio preprocessor
|
||||
GGML_ASSERT(audio_preproc != nullptr);
|
||||
audio_preproc->initialize();
|
||||
|
||||
// set special tokens
|
||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
|
||||
// [BEGIN_AUDIO] ... (embeddings) ...
|
||||
aud_beg = "[BEGIN_AUDIO]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
|
||||
// <sound> ... (embeddings) ...
|
||||
aud_beg = "<sound>";
|
||||
}
|
||||
}
|
||||
|
||||
// get clip ctx based on chunk type
|
||||
@@ -649,9 +573,8 @@ struct mtmd_tokenizer {
|
||||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
@@ -1302,8 +1225,7 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
|
||||
img_u8.ny = ny;
|
||||
img_u8.buf = rgb_values;
|
||||
clip_image_f32_batch batch_f32;
|
||||
GGML_ASSERT(ctx->image_preproc != nullptr);
|
||||
bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("%s: failed to preprocess image\n", __func__);
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user