Compare commits

..

3 Commits

Author SHA1 Message Date
Georgi Gerganov
9065ca71a2 tests : sampling tests use min_keep == 0
ggml-ci
2025-05-27 11:30:41 +03:00
Georgi Gerganov
76f31abe53 sampling : same for typical sampling 2025-05-27 11:30:24 +03:00
Georgi Gerganov
fe12a5d47e sampling : min-p should always return at least one token
ggml-ci
2025-05-27 11:26:57 +03:00
62 changed files with 1335 additions and 3807 deletions

View File

@@ -49,6 +49,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/mtmd/vendor/miniaudio.h]
[tools/mtmd/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@@ -26,12 +26,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu
g++-14-riscv64-linux-gnu \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -72,12 +72,12 @@ jobs:
glslc \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
libvulkan-dev:riscv64
libvulkan-dev:riscv64 \
libcurl4-openssl-dev:riscv64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -118,12 +118,12 @@ jobs:
build-essential \
glslc \
crossbuild-essential-arm64 \
libvulkan-dev:arm64
libvulkan-dev:arm64 \
libcurl4-openssl-dev:arm64
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
@@ -163,12 +163,12 @@ jobs:
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu
g++-14-powerpc64le-linux-gnu \
libcurl4-openssl-dev:ppc64el
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
@@ -209,12 +209,12 @@ jobs:
glslc \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu \
libvulkan-dev:ppc64el
libvulkan-dev:ppc64el \
libcurl4-openssl-dev:ppc64el
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
cmake -B build -DCMAKE_BUILD_TYPE=Release \
-DGGML_VULKAN=ON \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \

View File

@@ -903,16 +903,13 @@ struct common_init_result common_init_from_params(common_params & params) {
ok = false;
}
bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
if (!has_eos && !has_sep) {
LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
ok = false;
} else if (!has_eos) {
LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
} else if (!has_sep) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
}
if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
ok = false;
}

View File

@@ -423,19 +423,16 @@ class ModelBase:
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
except Exception as e:
logger.warning(f"Failed to load model config from {dir_model}: {e}")
logger.warning("Trying to load config.json instead")
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
config = json.load(f)
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
return config
if "llm_config" in config:
# rename for InternVL
config["text_config"] = config["llm_config"]
return config
@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1124,21 +1121,18 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
# for models having multiple encoders, we need to separate their hparams
hparams_vision: dict[str, Any] | None = None
hparams_audio: dict[str, Any] | None = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
if self.has_vision_encoder and self.has_audio_encoder:
raise NotImplementedError("both vision + audio not supported yet")
# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
@@ -1149,32 +1143,22 @@ class MmprojModel(ModelBase):
assert self.n_embd_text > 0, "n_embd not found in hparams"
# move vision config to the top level, while preserving the original hparams in global_config
import copy
self.global_config = copy.deepcopy(self.hparams)
self.hparams_vision = self.get_vision_config()
self.hparams_audio = self.get_audio_config()
self.global_config = self.hparams
if self.hparams_vision is None and self.hparams_audio is None:
if "vision_config" in self.hparams:
self.hparams = self.hparams["vision_config"]
elif "audio_config" in self.hparams:
self.hparams = self.hparams["audio_config"]
else:
raise ValueError("vision_config / audio_config not found in hparams")
# for compat with vision-only models
self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
# TODO @ngxson : this is a hack to support both vision and audio encoders
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config.get("vision_config")
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("audio_config")
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@@ -1186,49 +1170,33 @@ class MmprojModel(ModelBase):
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
# vision config
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.block_count)
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
if self.has_audio_encoder:
elif self.has_audio_encoder:
self.gguf_writer.add_clip_has_audio_encoder(True)
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
# audio config
self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_audio_block_count(self.block_count)
self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
if not self.has_vision_encoder and not self.has_audio_encoder:
else:
raise ValueError("MmprojModel must have either vision or audio encoder")
def write_vocab(self):
raise ValueError("MmprojModel does not support vocab writing")
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
assert self.hparams_vision is not None
return self._find_param(self.hparams_vision, keys, optional)
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
assert self.hparams_audio is not None
return self._find_param(self.hparams_audio, keys, optional)
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in obj), None)
if key is not None:
return obj[key]
if optional:
return None
raise KeyError(f"could not find any of: {keys}")
@ModelBase.register("GPTNeoXForCausalLM")
class GPTNeoXModel(TextModel):
@@ -1841,8 +1809,7 @@ class StableLMModel(TextModel):
"MistralForCausalLM",
"MixtralForCausalLM",
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration",
"LlamaModel")
"LlavaForConditionalGeneration")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True
@@ -1922,8 +1889,6 @@ class LlamaModel(TextModel):
if is_vision_tensor:
return [] # skip vision tensors
elif self.hf_arch == "LlamaModel":
name = "model." + name
elif name.startswith("model.text_model"):
name = name.replace("text_model.", "") # for SmolVLM
elif name.startswith("language_model."):
@@ -2172,9 +2137,6 @@ class Llama4VisionModel(MmprojModel):
# process vision tensors
if "positional_embedding_vlm" in name and ".weight" not in name:
name += ".weight"
if "multi_modal_projector.linear_1" in name:
# despite the name with number postfix, this is a single fully connected layer
return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
return [(self.map_tensor_name(name), data_torch)]
return []
@@ -2712,12 +2674,7 @@ class Qwen2Model(TextModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register(
"Qwen2VLModel",
"Qwen2VLForConditionalGeneration",
"Qwen2_5_VLForConditionalGeneration",
"Qwen2_5OmniModel",
)
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLModel(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2VL
@@ -2735,11 +2692,8 @@ class Qwen2VLModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("thinker."):
name = name.replace("thinker.", "")
if name.startswith("visual") or name.startswith("audio") or \
name.startswith("talker") or name.startswith("token2wav"):
# skip multimodal tensors
if name.startswith("visual."):
# skip visual tensors
return []
return [(self.map_tensor_name(name), data_torch)]
@@ -2748,27 +2702,21 @@ class Qwen2VLModel(TextModel):
class Qwen2VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
self.hparams["image_size"] = self.hparams.get("image_size", 560)
# rename config.json values
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
if "embed_dim" in self.hparams_vision: # qwen2vl
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
if "embed_dim" in self.hparams: # qwen2vl
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
self.hparams["hidden_size"] = self.hparams.get("embed_dim")
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
hparams = self.hparams_vision
model_type = self.global_config['model_type']
if model_type == 'qwen2_vl':
hparams = self.hparams
if self.global_config['model_type'] == 'qwen2_vl':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
if model_type == 'qwen2_5_omni':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
else:
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
elif self.global_config['model_type'] == 'qwen2_5_vl':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_vision_use_silu(True)
# find n_wa_pattern (window attention pattern)
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2826,66 +2774,6 @@ class Qwen2VLVisionModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("Qwen2_5OmniModel")
class Qwen25OmniModel(Qwen2VLVisionModel):
has_vision_encoder = True
has_audio_encoder = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_audio is not None
self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_audio is not None
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("vision_config")
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("audio_config")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# SinusoidsPositionEmbedding
assert self.hparams_audio is not None
max_timescale = 10000
length = 1500
channels = self.hparams_audio["hidden_size"]
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
yield ("audio_tower.embed_positions.weight", pos_embd)
def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
if ".conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F16
return False
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("thinker."):
name = name.replace("thinker.", "")
if name.startswith("audio_tower"):
# process audio tensors
if "conv1.bias" in name or "conv2.bias" in name:
# transpose conv1 and conv2 bias
data_torch = data_torch.unsqueeze(-1)
if "audio_bos_eos_token" in name:
# this tensor is left unused in transformers code
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
return []
return [(self.map_tensor_name(name), data_torch)]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):
def set_gguf_parameters(self):
@@ -3682,7 +3570,7 @@ class InternLM3Model(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
class BertModel(TextModel):
model_arch = gguf.MODEL_ARCH.BERT
@@ -3745,13 +3633,6 @@ class BertModel(TextModel):
if name.startswith("cls.seq_relationship"):
return []
# For BertForSequenceClassification (direct projection layer)
if name == "classifier.weight":
name = "classifier.out_proj.weight"
if name == "classifier.bias":
name = "classifier.out_proj.bias"
return [(self.map_tensor_name(name), data_torch)]
def _xlmroberta_tokenizer_init(self) -> None:

View File

@@ -288,7 +288,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")
tests = [
"ied 4 ½ months",
"Äpfel",
"Führer",
"",
" ",
" ",

View File

@@ -2,6 +2,7 @@
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
- `llama-server` when started w/ `--jinja` flag
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
## Universal support w/ Native & Generic handlers

View File

@@ -98,12 +98,3 @@ NOTE: some models may require large context window, for example: `-c 8192`
# note: no pre-quantized GGUF this model, as they have very poor result
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```
**Mixed modalities**:
```sh
# Qwen2.5 Omni
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```

View File

@@ -129,7 +129,6 @@ option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
@@ -177,6 +176,7 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)

View File

@@ -935,15 +935,6 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// repeat a to the specified shape
GGML_API struct ggml_tensor * ggml_repeat_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);
// sums repetitions in a into shape of b
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,

View File

@@ -1598,9 +1598,6 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]);
}
// reset the current copy to 0 so that the graphs will be similar during generation
// necessary for CUDA graphs
sched->cur_copy = 0;
}
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

View File

@@ -30,7 +30,6 @@ string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
if (CANN_INSTALL_DIR)
# Only Support Linux.

View File

@@ -299,25 +299,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
endif()
endif()
if (GGML_BACKEND_DL)
if (GGML_NATIVE)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
endif()
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
message(STATUS "PowerPC detected")
if (GGML_NATIVE)
@@ -357,10 +338,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected")
if (GGML_RVV)
if (GGML_XTHEADVECTOR)
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
elseif (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
if (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
else()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
@@ -498,6 +477,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
if (GGML_BACKEND_DL)
if (GGML_NATIVE)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same
# build, using set_source_files_properties() to set the arch flags is not possible
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
endif()
if (EMSCRIPTEN)
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

View File

@@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
}
}
return;
#elif defined __riscv_v
#elif defined(__riscv_v_intrinsic)
if (__riscv_vlenb() >= QK4_0) {
const size_t vl = QK4_0;
@@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
}
return;
}
#elif defined __riscv_v
#elif defined(__riscv_v_intrinsic)
if (__riscv_vlenb() >= QK4_0) {
const size_t vl = QK4_0;

View File

@@ -320,17 +320,21 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#endif
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>

View File

@@ -883,7 +883,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl = QK8_0;
@@ -1221,7 +1221,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl = QK8_1;
@@ -2384,7 +2384,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl = qk / 2;
for (; ib < nb; ++ib) {
@@ -2774,7 +2774,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl = qk / 2;
for (; ib < nb; ++ib) {
@@ -3121,7 +3121,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(acc);
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl;
size_t vlenb = __riscv_vlenb();
@@ -3460,7 +3460,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl;
size_t vlenb = __riscv_vlenb();
@@ -3897,7 +3897,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(accum);
#elif defined(__riscv_v)
#elif defined(__riscv_v_intrinsic)
size_t vl = qk;
for (; ib < nb; ++ib) {
@@ -5100,111 +5100,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#elif defined __riscv_xtheadvector
float sumf = 0;
uint8_t atmp[16];
for (int i = 0; i < nb; ++i) {
const uint8_t * q2 = x[i].qs;
const int8_t * q8 = y[i].qs;
const uint8_t * sc = x[i].scales;
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
uint8_t *patmp = atmp;
int vsums;
int tmp;
__asm__ __volatile__(
"th.vsetvli zero, %[vl16], e8, m1\n\t"
"th.vmv.v.x v8, zero\n\t"
"th.vlb.v v1, (%[sc])\n\t"
"th.vand.vi v0, v1, 0xF\n\t"
"th.vsrl.vi v1, v1, 4\n\t"
"th.vsb.v v0, (%[scale])\n\t"
"th.vwaddu.vx v16, v1, zero\n\t"
"th.vsetvli zero, %[vl16], e16, m2\n\t"
"th.vlh.v v2, (%[bsums])\n\t"
"th.vwmul.vv v4, v16, v2\n\t"
"th.vsetvli zero, %[vl16], e32, m4\n\t"
"th.vredsum.vs v8, v4, v8\n\t"
"th.vmv.x.s %[vsums], v8"
: [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
: [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
, [vl16] "r" (16)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf += dmin * vsums;
int isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vlb.v v0, (%[q2])\n\t"
"th.vsrl.vi v2, v0, 2\n\t"
"th.vsrl.vi v4, v0, 4\n\t"
"th.vsrl.vi v6, v0, 6\n\t"
"th.vand.vi v0, v0, 0x3\n\t"
"th.vand.vi v2, v2, 0x3\n\t"
"th.vand.vi v4, v4, 0x3\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v8, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"th.vsetvli zero, %[vl16], e16, m2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[tmp], 8\n\t"
"th.vsetvli zero, %[tmp], e32, m2\n\t"
"th.vlbu.v v12, (%[scale])\n\t"
"th.vmul.vv v10, v10, v12\n\t"
"th.vredsum.vs v0, v10, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
, [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q2 += 32; q8 += 128; patmp += 8;
}
sumf += dall * isum;
}
*s = sumf;
#elif defined __riscv_v
float sumf = 0;
uint8_t atmp[16];
#elif defined __riscv_v_intrinsic
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0;
uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
uint8_t atmp[16];
switch (vector_length) {
case 256:
@@ -6234,140 +6137,13 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#elif defined __riscv_xtheadvector
#elif defined __riscv_v_intrinsic
uint32_t utmp[4];
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const uint8_t * restrict q3 = x[i].qs;
const uint8_t * restrict qh = x[i].hmask;
const int8_t * restrict q8 = y[i].qs;
int8_t * scale = (int8_t *)utmp;
int tmp;
__asm__ __volatile__(
"li %[tmp], 12\n\t"
"th.vsetvli zero, %[tmp], e8, m1\n\t"
"th.vlb.v v0, (%[s6b])\n\t"
"th.vmv.v.v v2, v0\n\t"
"li %[tmp], 2\n\t"
"th.vsetvli zero, %[tmp], e64, m1\n\t"
"th.vmv.v.x v9, %[sh]\n\t"\
"th.vslidedown.vi v1, v0, 1\n\t"
"th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
"th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vid.v v9\n\t"
"th.vmv.x.s %[tmp], v1\n\t"
"th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
"th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
"th.vsrl.vv v4, v1, v9\n\t"
"th.vsrl.vv v2, v0, v8\n\t"
"th.vand.vx v5, v4, %[kmask1]\n\t"
"th.vand.vx v3, v2, %[kmask2]\n\t"
"th.vsll.vi v6, v5, 4\n\t"
"th.vor.vv v7, v6, v3\n\t"
"li %[tmp], 16\n\t"
"th.vsetvli zero, %[tmp], e8, m1\n\t"
"th.vsub.vx v0, v7, %[c]\n\t"
"th.vsb.v v0, (%[scale])"
: [tmp] "=&r" (tmp)
: [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
, [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
uint8_t m = 1;
int isum = 0;
for (int j = 0; j < QK_K; j += 128) {
__asm__ __volatile__(
// fixme: use v0p7 mask layout directly
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vlb.v v8, (%[q3])\n\t"
"th.vsrl.vi v10, v8, 2\n\t"
"th.vsrl.vi v12, v8, 4\n\t"
"th.vsrl.vi v14, v8, 6\n\t"
"th.vand.vi v8, v8, 3\n\t"
"th.vand.vi v10, v10, 3\n\t"
"th.vand.vi v12, v12, 3\n\t"
"th.vlb.v v2, (%[qh])\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v8, v8, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v10, v10, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v12, v12, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v14, v14, -4, v0.t\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v0, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"li %[tmp], 16\n\t"
"th.vsetvli zero, %[tmp], e16, m2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[tmp], 8\n\t"
"th.vsetvli zero, %[tmp], e32, m2\n\t"
"th.vlb.v v12, (%[scale])\n\t"
"th.vmul.vv v10, v10, v12\n\t"
"th.vredsum.vs v0, v10, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q3 += 32; q8 += 128; scale += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
sumf += d * isum;
}
*s = sumf;
#elif defined __riscv_v
uint32_t utmp[4];
float sumf = 0;
uint32_t aux[3];
uint32_t utmp[4];
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0;
switch (vector_length) {
case 256:
@@ -6555,7 +6331,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
"vslideup.vi v13, v14, 1\n\t"
"vslideup.vi v10, v8, 2\n\t"
"vslideup.vi v11, v13, 2\n\t"
"vsetivli zero, 8, e32, m2\n\t"
"vsetivli zero, 8, e32, m2\n\t"\
"vle8.v v15, (%[scale])\n\t"
"vsext.vf4 v12, v15\n\t"
"vmul.vv v10, v10, v12\n\t"
@@ -7404,129 +7180,13 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
#elif defined __riscv_xtheadvector
#elif defined __riscv_v_intrinsic
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
int tmp, tmp2, sumi;
__asm__ __volatile__(
"li %[t1], 12\n\t"
"th.vsetvli zero, %[t1], e8, m1\n\t"
"th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
"li %[t1], 4\n\t"
"th.vsetvli zero, %[t1], e32, m1\n\t"
"th.vslidedown.vi v2, v1, 2\n\t"
"th.vmv.v.v v3, v2\n\t"
"th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
"li %[t1], 2\n\t"
"th.vsetvli zero, %[t1], e32, m1\n\t"
"th.vmv.v.i v4, 4\n\t"
"th.vand.vx v8, v1, %[kmask1]\n\t"
"th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
"th.vsrl.vi v6, v1, 6\n\t"
"th.vsrl.vv v7, v2, v5\n\t"
"th.vand.vx v0, v6, %[kmask3]\n\t"
"th.vand.vx v2, v7, %[kmask2]\n\t"
"th.vsll.vi v6, v0, 4\n\t"
"li %[t2], 8\n\t"
"addi %[t1], %[utmp], 4\n\t"
"th.vor.vv v1, v6, v2\n\t"
"th.vssw.v v8, (%[utmp]), %[t2]\n\t"
"th.vssw.v v1, (%[t1]), %[t2]\n\t"
"th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
"th.vlw.v v2, (%[bsums])\n\t"
"th.vsetvli zero, %[t2], e16, m1\n\t"
"th.vnsrl.vi v0, v2, 0\n\t"
"th.vnsrl.vi v1, v2, 16\n\t"
"th.vadd.vv v2, v0, v1\n\t"
"th.vlbu.v v4, (%[mins])\n\t"
"th.vwmul.vv v6, v4, v2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vsetvli zero, %[t2], e32, m2\n\t"
"th.vredsum.vs v0, v6, v0\n\t"
"th.vmv.x.s %[sumi], v0"
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf -= dmin * sumi;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
sumi = 0;
const uint8_t * scale = scales;
for (int j = 0; j < QK_K/128; ++j) {
int vl128 = 128, vl64 = 64, vl32 = 32;
__asm__ __volatile__(
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v8, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vlb.v v0, (%[q4])\n\t"
"th.vsrl.vi v4, v0, 4\n\t"
"th.vand.vi v0, v0, 0xF\n\t"
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vwmul.vv v28, v6, v14\n\t"
"th.vwmul.vv v20, v4, v10\n\t"
"th.vwmul.vv v24, v2, v12\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vlbu.v v1, (%[scale])\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vsetvli zero, %[vl32], e16, m4\n\t"
"th.vwredsum.vs v6, v24, v0\n\t"
"th.vwredsum.vs v7, v28, v0\n\t"
"th.vwredsum.vs v4, v16, v0\n\t"
"th.vwredsum.vs v5, v20, v0\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v6, v7, 1\n\t"
"th.vslideup.vi v4, v5, 1\n\t"
"th.vslideup.vi v4, v6, 2\n\t"
"th.vmul.vv v8, v4, v1\n\t"
"th.vredsum.vs v0, v8, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[sumi], %[sumi], %[tmp]"
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q4 += 64; q8 += 128; scale += 4;
}
sumf += d * sumi;
}
*s = sumf;
#elif defined __riscv_v
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
float sumf = 0;
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0;
switch (vector_length) {
case 256:
@@ -8414,7 +8074,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#elif defined __riscv_v
#elif defined __riscv_v_intrinsic
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
@@ -9572,91 +9232,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
*s = sumf;
#elif defined __riscv_xtheadvector
#elif defined __riscv_v_intrinsic
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
const int8_t * restrict scale = x[i].scales;
int sum_t = 0;
int t0;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
"th.vlb.v v4, (%[qh])\n\t"
"th.vsll.vi v0, v4, 4\n\t"
"th.vsll.vi v2, v4, 2\n\t"
"th.vsrl.vi v6, v4, 2\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
"th.vlb.v v8, (%[q6])\n\t"
"th.vsrl.vi v12, v8, 4\n\t"
"th.vand.vi v8, v8, 0xF\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
"th.vand.vx v0, v0, %[mask]\n\t"
"th.vor.vv v8, v8, v0\n\t"
"th.vlb.v v0, (%[q8])\n\t"
"th.vsub.vx v8, v8, %[vl32]\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"li %[t0], 16\n\t"
"th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[t0], 4\n\t"
"th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[t0], 8\n\t"
"th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
"th.vlb.v v4, (%[scale])\n\t"
"th.vmul.vv v2, v4, v10\n\t"
"th.vredsum.vs v0, v2, v0\n\t"
"th.vmv.x.s %[t0], v0\n\t"
"add %[sumi], %[sumi], %[t0]"
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
, [mask] "r" (0x30)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q6 += 64; qh += 32; q8 += 128; scale += 8;
}
sumf += d * sum_t;
}
*s = sumf;
#elif defined __riscv_v
float sumf = 0;
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0;
switch (vector_length) {
case 256:

View File

@@ -7641,8 +7641,8 @@ static void ggml_compute_forward_ssm_scan_f32(
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
// use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; }
@@ -8070,14 +8070,6 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
#define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA
#define WKV_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
#define GGML_F32X GGML_F32xt
#define GGML_F32X_SET1 GGML_F32xt_SET1
#define GGML_F32X_LOAD GGML_F32xt_LOAD
#define GGML_F32X_STORE GGML_F32xt_STORE
#define GGML_F32X_MUL GGML_F32xt_MUL
#define GGML_F32X_FMA GGML_F32xt_FMA
#define WKV_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8088,14 +8080,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
#define WKV_VECTOR_SIZE 4
#endif
int wkv_vector_size;
#ifdef WKV_VECTOR_SIZE
#if defined(__ARM_FEATURE_SVE)
wkv_vector_size = svcntw();
#else
wkv_vector_size = WKV_VECTOR_SIZE;
#endif
const int64_t vec_count = head_size / wkv_vector_size;
const int64_t vec_count = head_size / WKV_VECTOR_SIZE;
for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride;
@@ -8125,7 +8111,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);
for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * wkv_vector_size;
size_t base_j = j * WKV_VECTOR_SIZE;
size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
@@ -8150,7 +8136,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
}
// Handle remaining elements, this will not be used.
for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset];
@@ -8286,14 +8272,6 @@ static void ggml_compute_forward_gla_f32(
#define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA
#define GLA_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
#define GGML_F32X GGML_F32xt
#define GGML_F32X_SET1 GGML_F32xt_SET1
#define GGML_F32X_LOAD GGML_F32xt_LOAD
#define GGML_F32X_STORE GGML_F32xt_STORE
#define GGML_F32X_MUL GGML_F32xt_MUL
#define GGML_F32X_FMA GGML_F32xt_FMA
#define GLA_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1
@@ -8304,14 +8282,8 @@ static void ggml_compute_forward_gla_f32(
#define GLA_VECTOR_SIZE 4
#endif
int gla_vector_size;
#ifdef GLA_VECTOR_SIZE
#if defined(__ARM_FEATURE_SVE)
gla_vector_size = svcntw();
#else
gla_vector_size = GLA_VECTOR_SIZE;
#endif
const int64_t vec_count = head_size / gla_vector_size;
const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride;
@@ -8338,7 +8310,7 @@ static void ggml_compute_forward_gla_f32(
GGML_F32X g_vec = GGML_F32X_SET1(g_val);
for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * gla_vector_size;
size_t base_j = j * GLA_VECTOR_SIZE;
size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
@@ -8362,7 +8334,7 @@ static void ggml_compute_forward_gla_f32(
}
// Handle remaining elements, this will not be used.
for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset];
@@ -8471,126 +8443,83 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
int64_t h_stride_2d = head_size * head_size;
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
// scalar Route to scalar implementation //TODO: Write SVE code
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t i = 0; i < head_size; i++) {
int64_t t_h_i_offset = t_h_offset + i;
int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
for (int64_t ii = 0; ii < head_size; ii++) {
int64_t t_h_i_offset = t_h_offset + ii;
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
float v_val = v[t_h_i_offset];
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
float sa = 0, result = 0;
for (int64_t j = 0; j < head_size; j++) {
sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
}
for (int64_t j = 0; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v_val * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
result += state_cur[h_2d_i_j_offset] * r_val;
}
dst_data[t_h_i_offset] = result;
}
}
}
#else
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t ii = 0; ii < head_size; ii++) {
int64_t t_h_i_offset = t_h_offset + ii;
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
float sa = 0;
{
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
}
}
GGML_F32_VEC_REDUCE(sa, sum);
}
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
int64_t j = 0;
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
for (; j < head_size; j += GGML_F32_STEP) {
float sa = 0;
{
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
// kv + s * decay + sa * b
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
}
}
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
GGML_F32_VEC_REDUCE(sa, sum);
}
// There shouldn't be left-overs though.
for (; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v[t_h_i_offset] * k_val;
int64_t j = 0;
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
for (; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
// kv + s * decay + sa * b
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
}
}
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
// There shouldn't be left-overs though.
for (; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v[t_h_i_offset] * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
}
}
}
#endif
}
#else
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;

View File

@@ -17,123 +17,7 @@
// number of elements to fit in a single register
//
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD
// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()
#define GGML_F32xt svfloat32_t
#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x) svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
{ \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
(res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
}
#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32_VEC GGML_F32xt
#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE GGML_F32xt_STORE
#define GGML_F32_VEC_FMA GGML_F32xt_FMA
#define GGML_F32_VEC_ADD GGML_F32xt_ADD
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
// F16 NEON
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)
#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16
#define GGML_F16_STEP 16
#define GGML_F16_EPR 4
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
#define GGML_F32Cx4_MUL vmulq_f32
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD

View File

@@ -17,98 +17,29 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
#if defined(GGML_SIMD)
float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t sum1 = svdup_n_f32(0.0f);
svfloat32_t sum2 = svdup_n_f32(0.0f);
svfloat32_t sum3 = svdup_n_f32(0.0f);
svfloat32_t sum4 = svdup_n_f32(0.0f);
svfloat32_t sum5 = svdup_n_f32(0.0f);
svfloat32_t sum6 = svdup_n_f32(0.0f);
svfloat32_t sum7 = svdup_n_f32(0.0f);
svfloat32_t sum8 = svdup_n_f32(0.0f);
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
for (int i = 0; i < np; i += ggml_f32_step) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
}
// leftovers
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
const int np2 = (n & ~(ggml_f32_epr - 1));
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
}
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
if (np2 < n) {
svbool_t pg = svwhilelt_b32(np2, n);
ax1 = svld1_f32(pg, x + np2);
ay1 = svld1_f32(pg, y + np2);
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
}
// reduce sum1,sum2 to sum1
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
#else
const int np = (n & ~(GGML_F32_STEP - 1));
}
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
// reduce sum0..sum3 to sum0
GGML_F32_VEC_REDUCE(sumf, sum);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
}
}
// reduce sum0..sum3 to sum0
GGML_F32_VEC_REDUCE(sumf, sum);
// leftovers
for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
#endif
// leftovers
for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
#else
// scalar
ggml_float sumf = 0.0;

View File

@@ -5,7 +5,6 @@
#include "ggml-impl.h"
#include "simd-mappings.h"
#include "ggml.h"
#include "ggml-cpu.h"
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
@@ -149,108 +148,27 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
const int np = (n & ~(GGML_F32_STEP - 1));
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
for (int i = 0; i < np; i += ggml_f32_step) {
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
GGML_F32_VEC_STORE(y + i, ay1);
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
// leftovers
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
const int np2 = (n & ~(ggml_f32_epr - 1));
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
}
GGML_F32_VEC_STORE(y + i, ay1);
}
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
if (np2 < n) {
svbool_t pg =svwhilelt_b32(np2, n);
ax1 = svld1_f32(pg, x + np2);
ay1 = svld1_f32(pg, y + np2);
ay1 = svmad_f32_m(pg, ax1, vx, ay1);
svst1_f32(pg, y + np2, ay1);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
#endif
// leftovers
for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
#else
// scalar
for (int i = 0; i < n; ++i) {
@@ -302,45 +220,36 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
}
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
// scalar Route to scalar implementation //TODO: Write SVE code
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = 0; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
}
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
}
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
// leftovers
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
}
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
}
#endif
}
#else
// scalar
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@@ -356,53 +265,25 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 2 * ggml_f32_epr;
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t ay1;
svfloat32_t ay2;
for (int i = 0; i < np; i += ggml_f32_step) {
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_MUL(ay1, vx);
GGML_F32_VEC_STORE(y + i, ay1);
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_MUL(ay2, vx);
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
// leftovers
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
if (np < n) {
svbool_t pg = svwhilelt_b32(np, n);
ay1 = svld1_f32(pg, y + np);
ay1 = svmul_f32_m(pg, ay1, vx);
svst1_f32(pg, y + np, ay1);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
}
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] *= v;
}
#endif
// leftovers
for (int i = np; i < n; ++i) {
y[i] *= v;
}
#else
// scalar
for (int i = 0; i < n; ++i) {

View File

@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
__builtin_assume(tid < D);
extern __shared__ float2 meta[];
for (int i = tid; i < 2*parallel_blocks; i += D) {
((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
if (tid < 2*parallel_blocks) {
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
}
__syncthreads();

View File

@@ -386,7 +386,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
return r;
}
#elif defined(__riscv) && defined(__riscv_zfhmin)
#elif defined(__riscv) && defined(GGML_RV_ZFH)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;

View File

@@ -55,17 +55,14 @@ endfunction()
set(GGML_OPENCL_KERNELS
add
argsort
clamp
cpy
cvt
diag_mask_inf
div
gelu
gemv_noshuffle_general
gemv_noshuffle
get_rows
group_norm
im2col_f32
im2col_f16
mul_mat_Ab_Bi_8x4
@@ -86,14 +83,11 @@ set(GGML_OPENCL_KERNELS
rms_norm
rope
scale
sigmoid
silu
softmax_4_f32
softmax_4_f16
softmax_f32
softmax_f16
sub
sum_rows
transpose
)

View File

@@ -299,37 +299,27 @@ struct ggml_backend_opencl_context {
cl_program program_mul_mv_f16_f32;
cl_program program_mul_mv_f32_f32;
cl_program program_mul;
cl_program program_div;
cl_program program_sub;
cl_program program_norm;
cl_program program_relu;
cl_program program_rms_norm;
cl_program program_group_norm;
cl_program program_rope;
cl_program program_scale;
cl_program program_silu;
cl_program program_sigmoid;
cl_program program_softmax_f32;
cl_program program_softmax_f16;
cl_program program_softmax_4_f32;
cl_program program_softmax_4_f16;
cl_program program_argsort_f32_i32;
cl_program program_sum_rows_f32;
cl_kernel kernel_add, kernel_add_row;
cl_kernel kernel_mul, kernel_mul_row;
cl_kernel kernel_div, kernel_div_row;
cl_kernel kernel_sub, kernel_sub_row;
cl_kernel kernel_scale;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
cl_kernel kernel_relu;
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
cl_kernel kernel_clamp;
cl_kernel kernel_norm;
cl_kernel kernel_rms_norm;
cl_kernel kernel_group_norm;
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
cl_kernel kernel_soft_max, kernel_soft_max_4;
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -349,8 +339,6 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
cl_kernel kernel_mul_mv_q6_K_f32;
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
cl_kernel kernel_argsort_f32_i32;
cl_kernel kernel_sum_rows_f32;
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// Transpose kernels
@@ -998,105 +986,6 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// argsort
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "argsort.cl.h"
};
#else
const std::string kernel_src = read_file("argsort.cl");
#endif
backend_ctx->program_argsort_f32_i32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
GGML_LOG_CONT(".");
}
// div
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "div.cl.h"
};
#else
const std::string kernel_src = read_file("div.cl");
#endif
backend_ctx->program_div =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
GGML_LOG_CONT(".");
}
// sub
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sub.cl.h"
};
#else
const std::string kernel_src = read_file("sub.cl");
#endif
backend_ctx->program_sub =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
GGML_LOG_CONT(".");
}
// sum_rows
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sum_rows.cl.h"
};
#else
const std::string kernel_src = read_file("sum_rows.cl");
#endif
backend_ctx->program_sum_rows_f32 =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
GGML_LOG_CONT(".");
}
// sigmoid
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sigmoid.cl.h"
};
#else
const std::string kernel_src = read_file("sigmoid.cl");
#endif
backend_ctx->program_sigmoid =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
GGML_LOG_CONT(".");
}
// group_norm
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "group_norm.cl.h"
};
#else
const std::string kernel_src = read_file("group_norm.cl");
#endif
backend_ctx->program_group_norm =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
GGML_LOG_CONT(".");
}
// Adreno kernels
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
// transpose
@@ -1967,8 +1856,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_ADD:
case GGML_OP_SCALE:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_SUB:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
@@ -1976,9 +1863,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_GELU_QUICK:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_UNARY_OP_SIGMOID:
return ggml_is_contiguous(op->src[0]);
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
}
@@ -1988,13 +1873,11 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return true;
case GGML_OP_GROUP_NORM:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_MUL_MAT:
if (op->src[0]->type == GGML_TYPE_F16) {
return true;
} else if (op->src[0]->type == GGML_TYPE_F32) {
return op->src[1]->type == GGML_TYPE_F32;
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
} else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
op->src[0]->type == GGML_TYPE_Q6_K) {
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
@@ -2029,10 +1912,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
}
case GGML_OP_IM2COL:
return true;
case GGML_OP_ARGSORT:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SUM_ROWS:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
default:
return false;
}
@@ -3359,256 +3238,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
}
}
static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3];
const int ne0 = dst->ne[0];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false;
cl_kernel kernel;
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
GGML_ASSERT(ggml_is_contiguous(src0));
// src1 is a row
GGML_ASSERT(ne11 == 1);
bcast_row = true;
int ne = ne00 / 4;
kernel = backend_ctx->kernel_div_row;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
kernel = backend_ctx->kernel_div;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
}
if (bcast_row) {
int n = ggml_nelements(dst)/4;
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
} else {
unsigned int nth = MIN(64, ne0);
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {nth, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
}
static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb00 = src0->nb[0];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
const int ne12 = src1->ne[2];
const int ne13 = src1->ne[3];
const cl_ulong nb10 = src1->nb[0];
const cl_ulong nb11 = src1->nb[1];
const cl_ulong nb12 = src1->nb[2];
const cl_ulong nb13 = src1->nb[3];
const int ne0 = dst->ne[0];
const cl_ulong nb0 = dst->nb[0];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
bool bcast_row = false;
cl_kernel kernel;
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
GGML_ASSERT(ggml_is_contiguous(src0));
// src1 is a row
GGML_ASSERT(ne11 == 1);
bcast_row = true;
int ne = ne00 / 4;
kernel = backend_ctx->kernel_sub_row;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
} else {
kernel = backend_ctx->kernel_sub;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
}
if (bcast_row) {
int n = ggml_nelements(dst)/4;
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
} else {
unsigned int nth = MIN(64, ne0);
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {nth, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
}
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -3800,58 +3429,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
#endif
}
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
UNUSED(src1);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sigmoid_f32;
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
kernel = backend_ctx->kernel_sigmoid_f16;
} else {
GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
const int64_t n = ggml_nelements(dst);
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
}
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -4049,65 +3626,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
#endif
}
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
UNUSED(src1);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
int32_t n_groups = ((const int32_t *) dst->op_params)[0];
int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
float eps = ((const float *) dst->op_params)[1];
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne = ne00*ne01*ne02;
cl_kernel kernel = backend_ctx->kernel_group_norm;
size_t sgs = 64;
if (backend_ctx->gpu_family == ADRENO) {
sgs = 64;
} else if (backend_ctx->gpu_family == INTEL) {
sgs = 32;
} else {
GGML_ASSERT(false && "Unsupported GPU");
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
size_t local_work_size[] = {(size_t)sgs, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@@ -5457,124 +4975,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
#endif
}
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
GGML_UNUSED(src1);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int nrows = ggml_nrows(src0);
int ne00_padded = 1;
while (ne00_padded < ne00) {
ne00_padded *= 2;
}
int order = (enum ggml_sort_order) dst->op_params[0];
cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
GGML_UNUSED(src1);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(ggml_is_contiguous(src0));
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
cl_command_queue queue = backend_ctx->queue;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)64, 1, 1};
#ifdef GGML_OPENCL_PROFILING
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
g_profiling_info.emplace_back();
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
//------------------------------------------------------------------------------
// Op offloading
//------------------------------------------------------------------------------
@@ -5623,18 +5023,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_mul;
break;
case GGML_OP_DIV:
if (!any_on_device) {
return false;
}
func = ggml_cl_div;
break;
case GGML_OP_SUB:
if (!any_on_device) {
return false;
}
func = ggml_cl_sub;
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_GELU:
@@ -5661,12 +5049,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_relu;
break;
case GGML_UNARY_OP_SIGMOID:
if (!any_on_device) {
return false;
}
func = ggml_cl_sigmoid;
break;
default:
return false;
} break;
@@ -5688,12 +5070,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_rms_norm;
break;
case GGML_OP_GROUP_NORM:
if (!any_on_device) {
return false;
}
func = ggml_cl_group_norm;
break;
case GGML_OP_MUL_MAT:
if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
return false;
@@ -5739,18 +5115,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_im2col;
break;
case GGML_OP_ARGSORT:
if (!any_on_device) {
return false;
}
func = ggml_cl_argsort;
break;
case GGML_OP_SUM_ROWS:
if (!any_on_device) {
return false;
}
func = ggml_cl_sum_rows;
break;
default:
return false;
}

View File

@@ -1,86 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
enum ggml_sort_order {
GGML_SORT_ORDER_ASC,
GGML_SORT_ORDER_DESC,
};
kernel void kernel_argsort_f32_i32(
global float * src0,
ulong offset0,
global int * dst,
ulong offsetd,
const int ne00,
const int ne00_pad,
const int order,
local int * dst_row
) {
// bitonic sort
int col = get_local_id(0);
int row = get_group_id(1);
if (col >= ne00_pad) {
return;
}
src0 = (global char *)((global char *)src0 + offset0);
dst = (global float *)((global char *)dst + offsetd);
global float * x_row = src0 + row * ne00;
// initialize indices
dst_row[col] = col;
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 2; k <= ne00_pad; k *= 2) {
for (int j = k / 2; j > 0; j /= 2) {
int ixj = col ^ j;
if (ixj > col) {
if ((col & k) == 0) {
if (dst_row[col] >= ne00 ||
(dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
) {
SWAP(dst_row[col], dst_row[ixj], int);
}
} else {
if (dst_row[ixj] >= ne00 ||
(dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
) {
SWAP(dst_row[col], dst_row[ixj], int);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// copy the result to dst without the padding
if (col < ne00) {
dst[row * ne00 + col] = dst_row[col];
}
}

View File

@@ -1,72 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// div
//------------------------------------------------------------------------------
kernel void kernel_div(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
}
}
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_div_row(
global float4 * src0,
ulong offset0,
global float4 * src1,
ulong offset1,
global float4 * dst,
ulong offsetd,
int ne
) {
src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] / src1[idx1];
}

View File

@@ -1,72 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
// Workgroup must be a subgroup
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_group_norm(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne,
int group_size,
float eps
) {
src0 = (global float *)((global char *)src0 + offset0);
dst = (global float *)((global char *)dst + offsetd);
int start = get_group_id(0) * group_size;
int end = start + group_size;
start += get_local_id(0);
if (end >= ne) {
end = ne;
}
float tmp = 0.0f;
for (int j = start; j < end; j += get_local_size(0)) {
tmp += src0[j];
}
tmp = sub_group_reduce_add(tmp);
const float mean = tmp / group_size;
tmp = 0.0f;
for (int j = start; j < end; j += get_local_size(0)) {
float xi = src0[j] - mean;
dst[j] = xi;
tmp += xi * xi;
}
tmp = sub_group_reduce_add(tmp);
const float variance = tmp / group_size;
const float scale = 1.0f/sqrt(variance + eps);
for (int j = start; j < end; j += get_local_size(0)) {
dst[j] *= scale;
}
}

View File

@@ -1,29 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// sigmoid
//------------------------------------------------------------------------------
kernel void kernel_sigmoid_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
}
kernel void kernel_sigmoid_f16(
global half * src0,
ulong offset0,
global half * dst,
ulong offsetd
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
}

View File

@@ -1,72 +0,0 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// div
//------------------------------------------------------------------------------
kernel void kernel_sub(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
}
}
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_sub_row(
global float4 * src0,
ulong offset0,
global float4 * src1,
ulong offset1,
global float4 * dst,
ulong offsetd,
int ne
) {
src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] - src1[idx1];
}

View File

@@ -1,39 +0,0 @@
kernel void kernel_sum_rows_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global float *)((global char *)src0 + offset0);
dst = (global float *)((global char *)dst + offsetd);
int i3 = get_global_id(2);
int i2 = get_global_id(1);
int i1 = get_global_id(0);
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
float row_sum = 0;
for (int i0 = 0; i0 < ne00; i0++) {
row_sum += src_row[i0];
}
dst_row[0] = row_sum;
}

View File

@@ -84,15 +84,6 @@ static void gelu_quick(const T *x, T *dst, int k,
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
}
template<typename T>
static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
auto x_i = x[i];
dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
}
}
template<typename T>
static void tanh(const T *x, T *dst, int k,
const sycl::nd_item<3> &item_ct1) {
@@ -409,20 +400,6 @@ static void gelu_quick_sycl(const T *x, T *dst, const int k,
});
}
template<typename T>
static void gelu_erf_sycl(const T *x, T *dst, const int k,
queue_ptr stream) {
const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
gelu_erf(x, dst, k, item_ct1);
});
}
template<typename T>
static void tanh_sycl(const T *x, T *dst, const int k,
queue_ptr stream) {
@@ -839,38 +816,6 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
}
}
inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
#else
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
GGML_ASSERT(dst->src[0]->type == dst->type);
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
switch (dst->type) {
#if defined (GGML_SYCL_F16)
case GGML_TYPE_F16:
{
auto data_pts = cast_data<sycl::half>(dst);
gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
break;
}
#endif
case GGML_TYPE_F32:
{
auto data_pts = cast_data<float>(dst);
gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
break;
}
default:
GGML_ABORT("GGML tensor type not supported!\n");
}
}
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
@@ -1480,11 +1425,6 @@ void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_gelu_quick(ctx, dst);
}
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_gelu_erf(ctx, dst);
}
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_tanh(ctx, dst);

View File

@@ -38,8 +38,6 @@ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

View File

@@ -3543,9 +3543,6 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_UNARY_OP_GELU_QUICK:
ggml_sycl_gelu_quick(ctx, dst);
break;
case GGML_UNARY_OP_GELU_ERF:
ggml_sycl_gelu_erf(ctx, dst);
break;
case GGML_UNARY_OP_TANH:
ggml_sycl_tanh(ctx, dst);
break;
@@ -4099,7 +4096,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SGN:

View File

@@ -109,6 +109,10 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
endif()
if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()
if (GGML_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE)
endif()

View File

@@ -1,6 +1,6 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
#include <chrono>
#include "ggml-cpu.h"
#endif
@@ -184,7 +184,9 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
#ifdef GGML_VULKAN_MEMORY_DEBUG
class vk_memory_logger;
#endif
#ifdef GGML_VULKAN_PERF
class vk_perf_logger;
#endif
static void ggml_vk_destroy_buffer(vk_buffer& buf);
static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -440,11 +442,9 @@ struct vk_device_struct {
#ifdef GGML_VULKAN_MEMORY_DEBUG
std::unique_ptr<vk_memory_logger> memory_logger;
#endif
// for GGML_VK_PERF_LOGGER
#ifdef GGML_VULKAN_PERF
std::unique_ptr<vk_perf_logger> perf_logger;
vk::QueryPool query_pool;
uint32_t num_queries;
#endif
~vk_device_struct() {
VK_LOG_DEBUG("destroy device " << name);
@@ -828,6 +828,8 @@ private:
#define VK_LOG_MEMORY(msg) ((void) 0)
#endif // GGML_VULKAN_MEMORY_DEBUG
#if defined(GGML_VULKAN_PERF)
class vk_perf_logger {
public:
void print_timings() {
@@ -837,7 +839,7 @@ public:
for (const auto& time : t.second) {
total += time;
}
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
}
timings.clear();
@@ -866,6 +868,7 @@ public:
private:
std::map<std::string, std::vector<uint64_t>> timings;
};
#endif // GGML_VULKAN_PERF
struct ggml_backend_vk_context {
std::string name;
@@ -955,8 +958,6 @@ struct vk_instance_t {
static bool vk_instance_initialized = false;
static vk_instance_t vk_instance;
static bool vk_perf_logger_enabled = false;
#ifdef GGML_VULKAN_CHECK_RESULTS
static size_t vk_skip_checks;
static size_t vk_output_tensor;
@@ -2756,9 +2757,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
#ifdef GGML_VULKAN_MEMORY_DEBUG
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
#endif
if (vk_perf_logger_enabled) {
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
}
#ifdef GGML_VULKAN_PERF
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
#endif
size_t dev_num = vk_instance.device_indices[idx];
@@ -3546,8 +3547,6 @@ static void ggml_vk_instance_init() {
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8886,7 +8885,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
ctx->tensor_ctxs[node_idx] = compute_ctx;
#if defined(GGML_VULKAN_CHECK_RESULTS)
#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
// Force context reset on each node so that each tensor ends up in its own context
// and can be run and compared to its CPU equivalent separately
last_node = true;
@@ -9506,29 +9505,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch
vk_context compute_ctx;
if (vk_perf_logger_enabled) {
// allocate/resize the query pool
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
if (ctx->device->query_pool) {
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
}
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
query_create_info.queryCount = cgraph->n_nodes + 100;
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
ctx->device->num_queries = query_create_info.queryCount;
}
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
}
// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
@@ -9556,17 +9532,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
if (vk_perf_logger_enabled) {
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
}
if (enqueued) {
++submitted_nodes;
@@ -9588,27 +9553,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}
}
if (vk_perf_logger_enabled) {
// End the command buffer and submit/wait
GGML_ASSERT(!ctx->compute_ctx.expired());
compute_ctx = ctx->compute_ctx.lock();
ggml_vk_ctx_end(compute_ctx);
ggml_vk_submit(compute_ctx, ctx->device->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
ctx->device->device.resetFences({ ctx->device->fence });
// Get the results and pass them to the logger
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
for (int i = 0; i < cgraph->n_nodes; i++) {
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
}
}
ctx->device->perf_logger->print_timings();
}
#ifdef GGML_VULKAN_PERF
ctx->device->perf_logger->print_timings();
#endif
ggml_vk_graph_cleanup(ctx);

View File

@@ -2312,26 +2312,6 @@ struct ggml_tensor * ggml_repeat(
return result;
}
struct ggml_tensor * ggml_repeat_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
const bool can_repeat = ggml_is_empty(a) || (
(ne0 % a->ne[0] == 0) &&
(ne1 % a->ne[1] == 0) &&
(ne2 % a->ne[2] == 0) &&
(ne3 % a->ne[3] == 0)
);
GGML_ASSERT(can_repeat);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
result->op = GGML_OP_REPEAT;
result->src[0] = a;
return result;
}
// ggml_repeat_back
struct ggml_tensor * ggml_repeat_back(

View File

@@ -2260,7 +2260,6 @@ class VisionProjectorType:
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio
QWEN25O = "qwen2.5o" # omni
# Items here are (block size, type size)

View File

@@ -902,6 +902,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"multi_modal_projector.linear_1", # llama 4
),
MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1124,7 +1125,6 @@ class TensorNameMap:
MODEL_TENSOR.A_POST_NORM: (
"audio_tower.layer_norm", # ultravox
"audio_tower.ln_post", # qwen2omni
),
MODEL_TENSOR.A_ENC_ATTN_Q: (
@@ -1161,16 +1161,12 @@ class TensorNameMap:
"audio_tower.layers.{bid}.fc2", # ultravox
),
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
# this prefix is added in the conversion code in modify_tensors()
MODEL_TENSOR.A_MMPROJ: (
"audio.multi_modal_projector.linear_{bid}", # ultravox
),
MODEL_TENSOR.A_MMPROJ_FC: (
"audio.multi_modal_projector.linear", # qwen2audio
"audio_tower.proj", # qwen2omni
),
MODEL_TENSOR.A_MM_NORM_PRE: (

View File

@@ -231,7 +231,7 @@ class SafetensorRemote:
response.raise_for_status()
# Get raw byte data
return response.content[slice(size if size > -1 else None)]
return response.content[:size]
@classmethod
def check_file_exist(cls, url: str) -> bool:

View File

@@ -1 +1 @@
06b715f4c170232af261425240914fa49c44f982
7c06c10c532a6cda913c17fc56341e8880ae341d

View File

@@ -14,7 +14,6 @@ add_library(llama
llama-batch.cpp
llama-chat.cpp
llama-context.cpp
llama-cparams.cpp
llama-grammar.cpp
llama-graph.cpp
llama-hparams.cpp

View File

@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
}
int64_t llm_graph_context::n_pos_per_embd() const {
return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
}
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,25 +1562,20 @@ void llm_graph_context::build_pooling(
ggml_tensor * inp_cls = build_inp_cls();
inp = ggml_get_rows(ctx0, inp, inp_cls);
if (cls != nullptr && cls_b != nullptr) {
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
cur = ggml_tanh(ctx0, cur);
// classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
GGML_ASSERT(cls != nullptr);
GGML_ASSERT(cls_b != nullptr);
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) {
GGML_ASSERT(cls_out_b != nullptr);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
}
} else if (cls_out) {
// Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
cur = ggml_tanh(ctx0, cur);
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) {
GGML_ASSERT(cls_out_b != nullptr);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
} else {
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
}
} break;
default:

View File

@@ -286,11 +286,31 @@ void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_po
}
llama_pos llama_kv_cache_unified::seq_pos_min(llama_seq_id seq_id) const {
return cells.seq_pos_min(seq_id);
llama_pos result = std::numeric_limits<llama_pos>::max();
for (uint32_t i = 0; i < cells.size(); ++i) {
if (cells.seq_has(i, seq_id)) {
result = std::min(result, cells.pos_get(i));
}
}
if (result == std::numeric_limits<llama_pos>::max()) {
result = -1;
}
return result;
}
llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
return cells.seq_pos_max(seq_id);
llama_pos result = -1;
for (uint32_t i = 0; i < cells.size(); ++i) {
if (cells.seq_has(i, seq_id)) {
result = std::max(result, cells.pos_get(i));
}
}
return result;
}
void llama_kv_cache_unified::restore() {
@@ -484,7 +504,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) {
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
n = std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
n = std::min(cells.size(), std::max(n_pad, GGML_PAD(cell_max(), n_pad)));
#ifdef FIND_SLOT_DEBUG
LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa);
@@ -757,19 +777,11 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
// @ngxson : this is a workaround
// for M-RoPE, we want to rotate the whole vector when doing KV shift
// a normal RoPE should work, we just need to use the correct ordering
// ref: https://github.com/ggml-org/llama.cpp/pull/13870
? LLAMA_ROPE_TYPE_NEOX
: hparams.rope_type;
const auto & rope_type = hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
: cparams.yarn_attn_factor;
const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
ggml_tensor * tmp;
@@ -1006,7 +1018,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
const uint32_t n_layer = layers.size();
const uint32_t n_kv = cells.used_max_p1();
const uint32_t n_kv = cell_max();
const uint32_t n_used = cells.get_used();
assert(n_used <= n_kv);
@@ -1132,6 +1144,16 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
return true;
}
uint32_t llama_kv_cache_unified::cell_max() const {
for (uint32_t i = cells.size(); i > 0; --i) {
if (!cells.is_empty(i - 1)) {
return i;
}
}
return 0;
}
bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
assert(p0 >= 0 && p1 >= 0);

View File

@@ -246,6 +246,10 @@ private:
// return true if cells have been moved
bool defrag_prepare(int32_t n_max_nodes);
// find how many cells are currently in use
// TODO: optimize
uint32_t cell_max() const;
size_t total_size() const;
size_t size_k_bytes() const;

View File

@@ -6,7 +6,6 @@
#include <bitset>
#include <cassert>
#include <vector>
#include <set>
// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
@@ -19,13 +18,8 @@ public:
seq[i].reset();
}
used = 0;
has_shift = false;
used.clear();
for (uint32_t s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
seq_pos[s].clear();
}
}
void reset_shift() {
@@ -56,25 +50,7 @@ public:
}
uint32_t get_used() const {
return used.size();
}
// the index of the first cell that is used
// return 0 if no cells are used
uint32_t used_min() const {
return used.empty() ? 0 : *used.begin();
}
// the index of the last cell that is used + 1
// return 0 if no cells are used
uint32_t used_max_p1() const {
#if 0
if (!seq_pos[0].empty()) printf("kv_cells: min[0] = %5d, max[0] = %5d\n", *seq_pos[0].begin(), *seq_pos[0].rbegin());
if (!seq_pos[1].empty()) printf("kv_cells: min[1] = %5d, max[1] = %5d\n", *seq_pos[1].begin(), *seq_pos[1].rbegin());
if (!seq_pos[2].empty()) printf("kv_cells: min[2] = %5d, max[2] = %5d\n", *seq_pos[2].begin(), *seq_pos[2].rbegin());
#endif
return used.empty() ? 0 : *used.rbegin() + 1;
return used;
}
bool get_has_shift() const {
@@ -93,9 +69,6 @@ public:
pos [isrc] = -1;
shift[isrc] = 0;
seq [isrc].reset();
used.erase (isrc);
used.insert(idst);
}
// copy the state of cells [i, i + n) (used for save/restore the state of the cells)
@@ -122,24 +95,16 @@ public:
for (uint32_t j = 0; j < other.pos.size(); ++j) {
if (pos[i + j] == -1 && other.pos[j] != -1) {
used.insert(i + j);
used++;
}
if (pos[i + j] != -1 && other.pos[j] == -1) {
used.erase(i + j);
}
if (pos[i + j] != -1) {
seq_pos_rm(i + j);
used--;
}
pos[i + j] = other.pos[j];
seq[i + j] = other.seq[j];
if (pos[i + j] != -1) {
seq_pos_add(i + j);
}
assert(shift[i + j] == 0);
}
}
@@ -153,12 +118,11 @@ public:
assert(seq_id >= 0);
seq[i].reset(seq_id);
seq_pos[seq_id].erase(pos[i]);
if (seq[i].none()) {
pos[i] = -1;
used.erase(i);
used--;
return true;
}
@@ -171,22 +135,17 @@ public:
assert(i < pos.size());
if (seq[i].test(seq_id)) {
seq_pos_rm(i);
seq[i].reset();
seq[i].set(seq_id);
seq_pos[seq_id].insert(pos[i]);
return false;
}
if (seq[i].any()) {
seq_pos_rm(i);
seq[i].reset();
pos[i] = -1;
used.erase(i);
used--;
return true;
}
@@ -210,33 +169,6 @@ public:
assert(!seq[i].test(seq_id));
seq[i].set(seq_id);
seq_pos[seq_id].insert(pos[i]);
}
// the minimum position of sequence seq_id currently present in any of the cells
// return -1 if the sequence is not present
llama_pos seq_pos_min(llama_seq_id seq_id) const {
assert(seq_id >= 0);
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
if (seq_pos[seq_id].empty()) {
return -1;
}
return *seq_pos[seq_id].begin();
}
// the maximum position of sequence seq_id currently present in any of the cells
// return -1 if the sequence is not present
llama_pos seq_pos_max(llama_seq_id seq_id) const {
assert(seq_id >= 0);
assert(seq_id < LLAMA_MAX_PARALLEL_SEQUENCES);
if (seq_pos[seq_id].empty()) {
return -1;
}
return *seq_pos[seq_id].rbegin();
}
// note: call only if the cell is not empty
@@ -270,8 +202,7 @@ public:
assert(pos[i] == -1);
pos[i] = p;
used.insert(i);
used++;
}
// pos[i] = pos[i] + d
@@ -281,22 +212,16 @@ public:
assert(i < pos.size());
assert(pos[i] != -1);
seq_pos_rm(i);
pos[i] += d;
shift[i] += d;
seq_pos_add(i);
has_shift = true;
if (pos[i] < 0) {
seq_pos_rm(i);
seq[i].reset();
pos[i] = -1;
seq[i].reset();
used.erase(i);
used--;
return true;
}
@@ -313,21 +238,16 @@ public:
const llama_pos p_old = pos[i];
seq_pos_rm(i);
pos[i] /= d;
shift[i] += p_old - pos[i];
seq_pos_add(i);
has_shift = true;
}
private:
bool has_shift = false;
uint32_t used = 0; // used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
// set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
std::set<uint32_t> used;
bool has_shift = false;
std::vector<llama_pos> pos;
@@ -348,32 +268,6 @@ private:
//
std::vector<llama_pos> shift;
using bits_t = std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>;
// the bitset seq[i] tells us which sequences are currently occupying the i-th cell
std::vector<bits_t> seq;
// the set seq_pos[s] tells us which positions are currently present for sequence s
// this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
std::set<llama_pos> seq_pos[LLAMA_MAX_PARALLEL_SEQUENCES];
// helper functions for updating `seq_pos`, once cell at a time:
// remove cell i
void seq_pos_rm(uint32_t i) {
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
if (seq[i].test(s)) {
seq_pos[s].erase(pos[i]);
}
}
}
// add cell i
void seq_pos_add(uint32_t i) {
for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
if (seq[i].test(s)) {
seq_pos[s].insert(pos[i]);
}
}
}
std::vector<std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>> seq;
};

View File

@@ -1,54 +1,48 @@
# mtmd
# compile mtmd-audio separately to avoid long compile times with miniaudio.h
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
target_include_directories(mtmd_audio PRIVATE .)
add_library(mtmd OBJECT
mtmd.cpp
mtmd-audio.cpp
mtmd-helper.cpp
mtmd.h
clip.cpp
clip.h
clip-impl.h
)
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd PUBLIC .)
target_include_directories(mtmd PRIVATE ../..)
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
target_compile_features(mtmd PRIVATE cxx_std_17)
# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h
add_library(mtmd_helper OBJECT
mtmd-helper.cpp
mtmd-helper.h
)
target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(mtmd_helper PUBLIC .)
target_include_directories(mtmd_helper PRIVATE ./vendor)
target_include_directories(mtmd_helper PRIVATE ../..)
target_compile_features(mtmd_helper PRIVATE cxx_std_17)
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
if (BUILD_SHARED_LIBS)
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_shared LIBRARY)
set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd>)
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS mtmd_helper_shared LIBRARY)
endif()
if (NOT MSVC)
# for stb_image.h and miniaudio.h
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(mtmd BUILD_INFO)
add_dependencies(mtmd_helper BUILD_INFO)
endif()
add_executable(llama-llava-cli deprecation-warning.cpp)
@@ -60,5 +54,5 @@ set(TARGET llama-mtmd-cli)
add_executable(${TARGET} mtmd-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -130,7 +130,6 @@ enum projector_type {
PROJECTOR_TYPE_INTERNVL,
PROJECTOR_TYPE_LLAMA4,
PROJECTOR_TYPE_QWEN2A,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_UNKNOWN,
};
@@ -149,7 +148,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

File diff suppressed because it is too large Load Diff

View File

@@ -17,22 +17,12 @@ struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;
enum clip_modality {
CLIP_MODALITY_VISION,
CLIP_MODALITY_AUDIO,
};
struct clip_context_params {
bool use_gpu;
enum ggml_log_level verbosity;
};
struct clip_init_result {
struct clip_ctx * ctx_v; // vision context
struct clip_ctx * ctx_a; // audio context
};
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
void clip_free(struct clip_ctx * ctx);

View File

@@ -1,5 +1,28 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd-audio.h"
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio.h"
#define _USE_MATH_DEFINES // for M_PI
#include <cmath>
#include <cstdint>
@@ -336,6 +359,69 @@ bool preprocess_audio(
} // namespace whisper_preprocessor
namespace audio_helpers {
bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
return is_wav || is_mp3 || is_flac;
}
// returns true if the buffer is a valid audio file
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif
ma_decoder_uninit(&decoder);
return true;
}
} // namespace wav_utils
// precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
//

View File

@@ -32,7 +32,7 @@ struct whisper_filters {
std::vector<float> data;
};
bool preprocess_audio(
extern bool preprocess_audio(
const float * samples,
size_t n_samples,
const whisper_filters & filters,
@@ -40,8 +40,23 @@ bool preprocess_audio(
} // namespace whisper_preprocessor
// TODO @ngxson : move this helper to mtmd-helpers.cpp
namespace audio_helpers {
extern bool is_audio_file(const char * buf, size_t len);
extern bool decode_audio_from_buf(
const unsigned char * buf_in,
size_t len,
int target_sampler_rate,
std::vector<float> & pcmf32_mono);
} // namespace audio_helpers
namespace whisper_precalc_filters {
whisper_preprocessor::whisper_filters get_128_bins();
extern whisper_preprocessor::whisper_filters get_128_bins();
} // namespace whisper_precalc_filters

View File

@@ -7,7 +7,6 @@
#include "console.h"
#include "chat.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include <vector>
#include <limits.h>
@@ -144,7 +143,7 @@ struct mtmd_cli_context {
}
bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
if (!bmp.ptr) {
return false;
}
@@ -285,9 +284,7 @@ int main(int argc, char ** argv) {
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < params.image.size(); i++) {
params.prompt += mtmd_default_marker();
}
params.prompt += mtmd_default_marker();
}
common_chat_msg msg;
msg.role = "user";

View File

@@ -1,37 +1,10 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd.h"
#include "mtmd-helper.h"
#include "llama.h"
#include <algorithm>
#include <cinttypes>
#include <vector>
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "vendor/miniaudio.h"
#define STB_IMAGE_IMPLEMENTATION
#include "vendor/stb_image.h"
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
@@ -93,8 +66,7 @@ struct decode_embd_batch {
}
}
// M-RoPE for image
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int y = 0; y < ny; y++) {
@@ -113,23 +85,6 @@ struct decode_embd_batch {
}
}
// M-RoPE for audio
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
@@ -191,20 +146,18 @@ int32_t mtmd_helper_decode_image_chunk(
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
if (mtmd_decode_use_mrope(ctx)) {
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (!image_tokens) {
LOG_ERR("failed to decode chunk: image tokens are null\n");
return -1;
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {
GGML_ABORT("invalid chunk type for M-RoPE");
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
return -1;
}
if (!image_tokens) {
LOG_ERR("failed to decode chunk: image tokens are null\n");
return -1;
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
@@ -342,118 +295,3 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
return 0;
}
namespace audio_helpers {
static bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
return is_wav || is_mp3 || is_flac;
}
// returns true if the buffer is a valid audio file
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif
ma_decoder_uninit(&decoder);
return true;
}
} // namespace audio_helpers
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
if (audio_helpers::is_audio_file((const char *)buf, len)) {
std::vector<float> pcmf32;
int bitrate = mtmd_get_audio_bitrate(ctx);
if (bitrate < 0) {
LOG_ERR("This model does not support audio input\n");
return nullptr;
}
if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
LOG_ERR("Unable to read WAV audio file from buffer\n");
return nullptr;
}
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
}
// otherwise, we assume it's an image
mtmd_bitmap * result = nullptr;
{
int nx, ny, nc;
auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
if (!data) {
LOG_ERR("%s: failed to decode image bytes\n", __func__);
return nullptr;
}
result = mtmd_bitmap_init(nx, ny, data);
stbi_image_free(data);
}
return result;
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
std::vector<unsigned char> buf;
FILE * f = fopen(fname, "rb");
if (!f) {
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
return nullptr;
}
fseek(f, 0, SEEK_END);
long file_size = ftell(f);
fseek(f, 0, SEEK_SET);
buf.resize(file_size);
size_t n_read = fread(buf.data(), 1, file_size, f);
fclose(f);
if (n_read != (size_t)file_size) {
LOG_ERR("Failed to read entire file %s", fname);
return nullptr;
}
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}

View File

@@ -1,91 +0,0 @@
#ifndef MTMD_HELPER_H
#define MTMD_HELPER_H
#include "ggml.h"
#include "llama.h"
#include "mtmd.h"
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
//
// libmtmd helper functions
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
// audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunks * chunks,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past);
#ifdef __cplusplus
} // extern "C"
#endif
//
// C++ wrappers
//
#endif

View File

@@ -95,21 +95,15 @@ mtmd_context_params mtmd_context_params_default() {
}
struct mtmd_context {
struct clip_ctx * ctx_v; // vision
struct clip_ctx * ctx_a; // audio
struct clip_ctx * ctx_clip;
const struct llama_model * text_model;
std::vector<float> image_embd_v; // image embedding vector
bool print_timings;
int n_threads;
std::string media_marker;
const int n_embd_text;
// these are not token, but strings used to mark the beginning and end of image/audio embeddings
std::string img_beg;
std::string img_end;
std::string aud_beg;
std::string aud_end;
bool has_vision;
bool has_audio;
// for llava-uhd style models, we need special tokens in-between slices
// minicpmv calls them "slices", llama 4 calls them "tiles"
@@ -138,61 +132,33 @@ struct mtmd_context {
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
media_marker (ctx_params.media_marker),
n_embd_text (llama_model_n_embd(text_model))
media_marker (ctx_params.media_marker)
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
}
if (media_marker.empty()) {
throw std::runtime_error("media_marker must not be empty");
}
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;
ctx_a = res.ctx_a;
if (!ctx_v && !ctx_a) {
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
if (!ctx_clip) {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
// if both vision and audio mmproj are present, we need to validate their n_embd
if (ctx_v && ctx_a) {
int n_embd_v = clip_n_mmproj_embd(ctx_v);
int n_embd_a = clip_n_mmproj_embd(ctx_a);
if (n_embd_v != n_embd_a) {
throw std::runtime_error(string_format(
"mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
n_embd_v, n_embd_a));
}
}
// since we already validate n_embd of vision and audio mmproj,
// we can safely assume that they are the same
int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
if (n_embd_text != n_embd_clip) {
if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) {
throw std::runtime_error(string_format(
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
"hint: you may be using wrong mmproj\n",
n_embd_text, n_embd_clip));
llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip)));
}
if (ctx_v) {
init_vision();
}
if (ctx_a) {
init_audio();
}
}
void init_vision() {
GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_qwen2vl(ctx_v);
has_vision = clip_has_vision_encoder(ctx_clip);
has_audio = clip_has_audio_encoder(ctx_clip);
use_mrope = clip_is_qwen2vl(ctx_clip);
projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v);
projector_type proj = clip_get_projector_type(ctx_clip);
int minicpmv_version = clip_is_minicpmv(ctx_clip);
if (minicpmv_version == 2) {
// minicpmv 2.5 format:
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -237,82 +203,24 @@ struct mtmd_context {
ov_img_first = false; // overview image is last
}
// set boi/eoi
if (proj == PROJECTOR_TYPE_GEMMA3) {
// <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>";
img_end = "<end_of_image>";
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
img_beg = "<fake_token_around_image><global-img>";
img_end = "<fake_token_around_image>";
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>";
img_end = "<|vision_end|>";
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
// (more details in mtmd_context constructor)
img_beg = "<|image_start|>";
img_end = "<|image_end|>";
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
// <img> ... (image embeddings) ... </img>
img_beg = "<img>";
img_end = "</img>";
}
}
void init_audio() {
GGML_ASSERT(ctx_a != nullptr);
projector_type proj = clip_get_projector_type(ctx_a);
if (clip_has_whisper_encoder(ctx_a)) {
if (clip_has_whisper_encoder(ctx_clip)) {
// TODO @ngxson : check if model n_mel is 128 or 80
w_filters = whisper_precalc_filters::get_128_bins();
}
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
if (proj == PROJECTOR_TYPE_QWEN2A) {
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
aud_beg = "<|audio_bos|>";
aud_end = "<|audio_eos|>";
// warning messages
if (proj == PROJECTOR_TYPE_LLAMA4) {
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
}
}
// get clip ctx based on chunk type
clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
return ctx_v;
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
return ctx_a;
if (has_audio) {
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
}
GGML_ABORT("unknown chunk type");
}
projector_type proj_type_v() const {
return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
}
projector_type proj_type_a() const {
return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
}
~mtmd_context() {
clip_free(ctx_a);
clip_free(ctx_v);
clip_free(ctx_clip);
}
private:
@@ -359,315 +267,107 @@ void mtmd_free(mtmd_context * ctx) {
}
}
struct mtmd_tokenizer {
mtmd_context * ctx;
std::vector<const mtmd_bitmap *> bitmaps;
// copied from common_tokenize
static std::vector<llama_token> mtmd_tokenize_text_internal(
const struct llama_vocab * vocab,
const std::string & text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
std::string input_text;
bool add_special;
bool parse_special;
const llama_vocab * vocab;
mtmd_input_chunks cur;
mtmd_tokenizer(mtmd_context * ctx,
int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunks * output,
const mtmd_input_text * text,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
add_special = text->add_special;
parse_special = text->parse_special;
input_text = text->text;
vocab = llama_model_get_vocab(ctx->text_model);
size_t n_bitmaps) {
auto vocab = llama_model_get_vocab(ctx->text_model);
std::string prompt_modified(text->text);
std::string marker_modified(ctx->media_marker);
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
// for compatibility, we convert image marker to media marker
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
// a bit hacky here, but works for now
// for some models, we need to add prefix and suffix to the image embeddings
if (clip_is_gemma3(ctx->ctx_clip)) {
// gemma 3
// <start_of_image> ... (image embeddings) ... <end_of_image>
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
marker_modified = ctx->media_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
// (more details in mtmd_context constructor)
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
// <img> ... (image embeddings) ... </img>
marker_modified = "<img>" + ctx->media_marker + "</img>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
marker_modified = "<|audio_bos|>" + ctx->media_marker + "<|audio_eos|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
// for compatibility, we convert image marker to media marker
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
}
int32_t tokenize(mtmd_input_chunks * output) {
cur.entries.clear();
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
size_t i_bm = 0; // index of the current bitmap
for (auto & part : parts) {
if (part == ctx->media_marker) {
// this is a marker, we should add the next bitmap
if (i_bm >= bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
return 1;
}
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
int32_t res = add_media(bitmap);
if (res != 0) {
return res;
}
} else {
// this is a text part, we should add it as text
add_text(part, parse_special);
}
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
if (add_special && llama_vocab_get_add_bos(vocab)) {
// if first chunk is text, we add BOS token to first text chunk
// otherwise, create a new text chunk with BOS token
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
// add BOS token to the beginning of first text chunk
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
} else {
// create a new text chunk with BOS token at the beginning
mtmd_input_chunk bos_chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
{llama_vocab_bos(vocab)},
nullptr, // image tokens
nullptr, // audio tokens
};
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
}
}
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
output->entries.clear();
output->entries.reserve(parts.size());
if (add_special && llama_vocab_get_add_eos(vocab)) {
// if last chunk is text, we add EOS token to it
add_text({llama_vocab_eos(vocab)});
}
size_t i_bm = 0;
if (i_bm != bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
return 1;
}
// utility for adding raw tokens
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
nullptr, // image tokens
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
};
*output = std::move(cur);
return 0;
}
void add_text(const std::string & txt, bool parse_special) {
LOG_DBG("%s: %s\n", __func__, txt.c_str());
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
add_text(tokens);
}
void add_text(const std::vector<llama_token> & tokens) {
if (tokens.empty()) {
return;
}
// if last entry is also a text chunk, add tokens to it instead of creating new chunk
if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
cur.entries.back().tokens_text.insert(
cur.entries.back().tokens_text.end(),
tokens.begin(),
tokens.end());
} else {
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
tokens,
nullptr, // image tokens
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));
}
}
int32_t add_media(const mtmd_bitmap * bitmap) {
if (!bitmap->is_audio) {
// handle image
if (!ctx->ctx_v) {
LOG_ERR("%s: error: model does not support vision input\n", __func__);
return 2;
}
if (!ctx->img_beg.empty()) {
add_text(ctx->img_beg, true); // add image begin token
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmap->nx;
img_u8->ny = bitmap->ny;
img_u8->buf.resize(bitmap->data.size());
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// handle llava-uhd style preprocessing
if (
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
) {
// split batch into chunks of single images
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
chunks.erase(chunks.begin());
// add overview image (first)
if (ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_start});
}
cur.entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_end});
}
}
// add slices (or tiles)
if (!chunks.empty()) {
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_slices_start});
}
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
const bool is_last_in_row = (x == n_col - 1);
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_start});
}
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_end});
}
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_mid});
}
}
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_row_end});
}
}
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_slices_end});
}
}
// add overview image (last)
if (!ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_start});
}
cur.entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_end});
}
}
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));
}
if (!ctx->img_end.empty()) {
add_text(ctx->img_end, true); // add image end token
}
} else {
// handle audio
if (!ctx->ctx_a) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;
}
if (bitmap->data.size() == 0) {
LOG_ERR("%s: error: empty audio data\n", __func__);
return 2;
}
if (!ctx->aud_beg.empty()) {
add_text(ctx->aud_beg, true); // add audio begin token
}
// preprocess audio
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
const float * samples = (const float *)bitmap->data.data();
size_t n_samples = bitmap->data.size() / sizeof(float);
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
}
// consider each mel_spec as a separate audio chunk
// TODO: maybe support batching, but this may come with memory cost
for (auto & mel_spec : mel_spec_chunks) {
clip_image_f32_ptr mel_f32(clip_image_f32_init());
mel_f32->nx = mel_spec.n_len;
mel_f32->ny = mel_spec.n_mel;
mel_f32->buf = std::move(mel_spec.data);
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
clip_image_f32_batch batch_f32;
batch_f32.is_audio = true;
batch_f32.entries.push_back(std::move(mel_f32));
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
audio_tokens->n_tokens = n_tokens;
audio_tokens->batch_f32 = std::move(batch_f32);
audio_tokens->id = bitmap->id; // optional
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_AUDIO,
{}, // text tokens
nullptr, // image tokens
std::move(audio_tokens),
};
cur.entries.emplace_back(std::move(chunk));
}
if (!ctx->aud_end.empty()) {
add_text(ctx->aud_end, true); // add audio end token
}
}
return 0;
}
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
// utility for splitting batch of multiple images into chunks of batch having single images
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
for (auto & entry : batch_f32.entries) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->id = id;
@@ -682,57 +382,222 @@ struct mtmd_tokenizer {
}
return chunks;
}
};
// for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
std::vector<std::string> result;
if (input.empty()) {
return result;
for (const auto & part : parts) {
// printf("tokenizing part: %s\n", part.c_str());
bool add_bos = &parts.front() == &part;
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
if (tokens.empty()) {
continue;
}
size_t start = 0;
size_t pos = 0;
while ((pos = input.find(delimiter, start)) != std::string::npos) {
if (pos > start) {
result.push_back(input.substr(start, pos - start));
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
nullptr, // image tokens
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
// only add image/audio tokens to middle of 2 parts
// therefore, we skip handling image/audio if this is the last part
if (&parts.back() == &part) {
continue;
}
if (!bitmaps[i_bm]->is_audio) {
// handle image
if (i_bm >= n_bitmaps) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
result.push_back(delimiter);
start = pos + delimiter.length();
}
if (start < input.length()) {
result.push_back(input.substr(start));
}
return result;
}
// copied from common_tokenize
static std::vector<llama_token> mtmd_tokenize_text_internal(
const struct llama_vocab * vocab,
const std::string & text,
bool add_special,
bool parse_special) {
// upper limit for the number of tokens
int n_tokens = text.length() + 2 * add_special;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
GGML_ASSERT(check == -n_tokens);
if (!ctx->has_vision) {
LOG_ERR("%s: error: model does not support vision input\n", __func__);
return 2;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmaps[i_bm]->nx;
img_u8->ny = bitmaps[i_bm]->ny;
img_u8->buf.resize(bitmaps[i_bm]->data.size());
std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// handle llava-uhd style preprocessing
if (
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
) {
// split batch into chunks of single images
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
chunks.erase(chunks.begin());
// add overview image (first)
if (ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_start});
}
output->entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_end});
}
}
// add slices (or tiles)
if (!chunks.empty()) {
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_slices_start});
}
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
const bool is_last_in_row = (x == n_col - 1);
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_start});
}
output->entries.emplace_back(std::move(chunks[y * n_col + x]));
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_end});
}
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_mid});
}
}
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_row_end});
}
}
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_slices_end});
}
}
// add overview image (last)
if (!ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_start});
}
output->entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_end});
}
}
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmaps[i_bm]->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
}
i_bm++; // move to next image
continue;
} else {
result.resize(n_tokens);
}
return result;
}
};
// handle audio
int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunks * output,
const mtmd_input_text * text,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps) {
mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
return tokenizer.tokenize(output);
if (i_bm >= n_bitmaps) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
if (!ctx->has_audio) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;
}
if (bitmaps[i_bm]->data.size() == 0) {
LOG_ERR("%s: error: empty audio data\n", __func__);
return 2;
}
// preprocess audio
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
const float * samples = (const float *)bitmaps[i_bm]->data.data();
size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
}
// consider each mel_spec as a separate audio chunk
// TODO: maybe support batching, but this may come with memory cost
for (auto & mel_spec : mel_spec_chunks) {
clip_image_f32_ptr mel_f32(clip_image_f32_init());
mel_f32->nx = mel_spec.n_len;
mel_f32->ny = mel_spec.n_mel;
mel_f32->buf = std::move(mel_spec.data);
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
clip_image_f32_batch batch_f32;
batch_f32.is_audio = true;
batch_f32.entries.push_back(std::move(mel_f32));
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
audio_tokens->n_tokens = n_tokens;
audio_tokens->batch_f32 = std::move(batch_f32);
audio_tokens->id = bitmaps[i_bm]->id; // optional
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_AUDIO,
{}, // text tokens
nullptr, // image tokens
std::move(audio_tokens),
};
output->entries.emplace_back(std::move(chunk));
}
i_bm++;
continue;
}
}
return 0;
}
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -740,54 +605,41 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
return 0;
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
if (!ctx->ctx_v) {
LOG_ERR("%s: model does not support vision input\n", __func__);
return 1;
}
return mtmd_encode(ctx, chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
if (!ctx->ctx_a) {
LOG_ERR("%s: model does not support audio input\n", __func__);
return 1;
}
int n_mmproj_embd = ctx->n_embd_text;
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
bool ok = clip_image_batch_encode(
ctx->ctx_a,
ctx->ctx_clip,
ctx->n_threads,
&chunk->tokens_audio->batch_f32,
ctx->image_embd_v.data());
return ok ? 0 : 1;
}
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
return 1;
}
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
clip_ctx * ctx_clip = ctx->ctx_v;
if (!ctx_clip) {
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
return 1;
}
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
ok = clip_image_encode(
ctx_clip,
ctx->ctx_clip,
ctx->n_threads,
entries[i].get(),
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
}
} else {
ok = clip_image_batch_encode(
ctx_clip,
ctx->ctx_clip,
ctx->n_threads,
&image_tokens->batch_f32,
ctx->image_embd_v.data());
@@ -801,7 +653,8 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return false;
@@ -812,19 +665,60 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
}
bool mtmd_support_vision(mtmd_context * ctx) {
return ctx->ctx_v != nullptr;
return ctx->has_vision;
}
bool mtmd_support_audio(mtmd_context * ctx) {
return ctx->ctx_a != nullptr;
return ctx->has_audio;
}
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
if (!ctx->ctx_a) {
return -1;
// these 2 helpers below use internal clip_image_u8_ptr,
// so unfortunately they cannot moved to mtmd-helper.h
// however, in theory, user can decode image file to bitmap using
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
if (audio_helpers::is_audio_file((const char *)buf, len)) {
std::vector<float> pcmf32;
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
LOG_ERR("Unable to read WAV audio file from buffer\n");
return nullptr;
}
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
}
// for now, we assume that all audio models have the same bitrate
return 16000; // 16kHz
clip_image_u8_ptr img_u8(clip_image_u8_init());
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
if (!ok) {
LOG_ERR("Unable to load image from buffer\n");
return nullptr;
}
uint32_t nx, ny;
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
return mtmd_bitmap_init(nx, ny, data);
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
std::vector<unsigned char> buf;
FILE * f = fopen(fname, "rb");
if (!f) {
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
return nullptr;
}
fseek(f, 0, SEEK_END);
long file_size = ftell(f);
fseek(f, 0, SEEK_SET);
buf.resize(file_size);
size_t n_read = fread(buf.data(), 1, file_size, f);
fclose(f);
if (n_read != (size_t)file_size) {
LOG_ERR("Failed to read entire file %s", fname);
return nullptr;
}
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
}
//

View File

@@ -109,10 +109,6 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
// whether the current model supports audio input
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
// get audio bitrate in Hz, for example 16000 for Whisper
// return -1 if audio is not supported
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
// mtmd_bitmap
//
// if bitmap is image:
@@ -213,6 +209,75 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
/////////////////////////////////////////
//
// Helper functions (can be implemented based on other functions)
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
// audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunks * chunks,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
bool logits_last,
llama_pos * new_n_past);
// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
struct llama_context * lctx,
const mtmd_input_chunk * chunk,
float * encoded_embd,
llama_pos n_past,
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past);
/////////////////////////////////////////
// test function, to be used in test-mtmd-c-api.c
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);

Binary file not shown.

View File

@@ -25,99 +25,80 @@ RUN_HUGE_TESTS=false
if [ "${1:-}" = "huge" ]; then
RUN_HUGE_TESTS=true
RUN_BIG_TESTS=true
echo "Include BIG and HUGE models..."
echo "Include BIG models..."
fi
###############
arr_prefix=()
arr_bin=()
arr_hf=()
arr_tmpl=() # chat template
arr_file=()
add_test_vision() {
local hf=$1
local tmpl=${2:-""} # default to empty string if not provided
arr_prefix+=("[vision]")
add_test() {
local bin=$1
local hf=$2
local tmpl=${3:-""} # default to empty string if not provided
arr_bin+=("$bin")
arr_hf+=("$hf")
arr_tmpl+=("$tmpl")
arr_file+=("test-1.jpeg")
}
add_test_audio() {
local hf=$1
arr_prefix+=("[audio] ")
arr_hf+=("$hf")
arr_tmpl+=("") # no need for chat tmpl
arr_file+=("test-2.mp3")
}
add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
add_test "llama-mtmd-cli" "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
# add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
fi
# to test the huge models, run: ./tests.sh huge
# this will run both the big and huge models
# huge models are > 32B parameters
if [ "$RUN_HUGE_TESTS" = true ]; then
add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
fi
# these models always give the wrong answer, not sure why
# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
# this model has broken chat template, not usable
# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
# add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
###############
cmake --build build -j --target llama-mtmd-cli
cmake --build build -j --target "${arr_bin[@]}"
arr_res=()
for i in "${!arr_hf[@]}"; do
bin="llama-mtmd-cli"
prefix="${arr_prefix[$i]}"
for i in "${!arr_bin[@]}"; do
bin="${arr_bin[$i]}"
hf="${arr_hf[$i]}"
tmpl="${arr_tmpl[$i]}"
inp_file="${arr_file[$i]}"
echo "Running test with binary: $bin and HF model: $hf"
echo ""
@@ -126,7 +107,7 @@ for i in "${!arr_hf[@]}"; do
output=$(\
"$PROJ_ROOT/build/bin/$bin" \
-hf "$hf" \
--image $SCRIPT_DIR/$inp_file \
--image $SCRIPT_DIR/test-1.jpeg \
-p "what is the publisher name of the newspaper?" \
--temp 0 -n 128 \
${tmpl:+--chat-template "$tmpl"} \
@@ -135,9 +116,9 @@ for i in "${!arr_hf[@]}"; do
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
if echo "$output" | grep -iq "new york"; then
result="$prefix \033[32mOK\033[0m: $bin $hf"
result="\033[32mOK\033[0m: $bin $hf"
else
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
result="\033[31mFAIL\033[0m: $bin $hf"
fi
echo -e "$result"
arr_res+=("$result")

View File

@@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)

View File

@@ -9,7 +9,6 @@
#include "sampling.h"
#include "speculative.h"
#include "mtmd.h"
#include "mtmd-helper.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@@ -4188,7 +4187,7 @@ int main(int argc, char ** argv) {
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image or audio file");
}

View File

@@ -6,7 +6,6 @@
#include "arg.h" // common_remote_get_content
#include "base64.hpp"
#include "mtmd.h"
#include "mtmd-helper.h"
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -265,19 +264,13 @@ static size_t validate_utf8(const std::string& text) {
static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
llama_tokens result;
// Get EOS token - use SEP token as fallback if EOS is not available
llama_token eos_token = llama_vocab_eos(vocab);
if (eos_token == LLAMA_TOKEN_NULL) {
eos_token = llama_vocab_sep(vocab);
}
result.reserve(doc.size() + query.size() + 4);
result.push_back(llama_vocab_bos(vocab));
result.insert(result.end(), query.begin(), query.end());
result.push_back(eos_token);
result.push_back(llama_vocab_eos(vocab));
result.push_back(llama_vocab_sep(vocab));
result.insert(result.end(), doc.begin(), doc.end());
result.push_back(eos_token);
result.push_back(llama_vocab_eos(vocab));
return result;
}
@@ -580,7 +573,7 @@ struct oaicompat_parser_options {
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */
const json & body, /* openai api json semantics */
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files)
{
@@ -631,7 +624,7 @@ static json oaicompat_chat_params_parse(
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
json & messages = body.at("messages");
json messages = body.at("messages");
if (!messages.is_array()) {
throw std::runtime_error("Expected 'messages' to be an array");
}