Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits: b6955...compilade/ (31 commits)

2ef41855cf, f88a4b9398, 4be1a5d44b, 6ffa46d8f4, 3126b5ee4e, e097d98a22, 5712aa895f, d3fcb0e90e,
614b95a88d, c3738cfcef, 791bd97b3c, d921057027, 562aa42c12, e996f3aef8, e7b7ed8ab1, c4b630f25d,
7f09a680af, aa374175c3, 5b180c3d60, b7f9010d24, 4882f0ff78, 9d7c518d64, 22c8c3c6ad, 6db3d1ffe6,
230d1169e5, a44d77126c, 5886f4f545, 92bb84f775, 13b339bcd9, 2f0c2db43e, fd2f84f468
@@ -507,6 +507,10 @@ struct common_params {
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;

    bool has_speculative() const {
        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
    }
};

// call once at the start of a program if it uses libcommon
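The comment above describes the abort semantics of the new callback fields. A minimal client-side sketch is shown below (hypothetical helper, assuming `llama_progress_callback` is `bool (*)(float progress, void * user_data)` as declared in `llama.h`):

```c
#include <stdbool.h>
#include <stdio.h>

// Hypothetical example callback: report progress and never abort the load.
static bool example_load_progress(float progress, void * user_data) {
    (void) user_data;
    fprintf(stderr, "\rloading model: %3.0f%%", progress * 100.0f);
    return true; // returning false would abort model loading
}

// usage sketch, assuming a common_params instance named params:
//   params.load_progress_callback           = example_load_progress;
//   params.load_progress_callback_user_data = NULL;
```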
@@ -11,6 +11,7 @@ import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from hashlib import sha256
|
||||
@@ -76,6 +77,14 @@ class ModelType(IntEnum):
|
||||
AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelTensorInfo:
|
||||
load: Callable[[], Tensor]
|
||||
size: int # in elements
|
||||
src_type: str
|
||||
auto_qtype: gguf.GGMLQuantizationType | None = None
|
||||
|
||||
|
||||
class ModelBase:
|
||||
_model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
|
||||
ModelType.TEXT: {},
|
||||
@@ -84,14 +93,16 @@ class ModelBase:
|
||||
|
||||
dir_model: Path
|
||||
ftype: gguf.LlamaFileType
|
||||
ftype_guessed: bool
|
||||
fname_out: Path
|
||||
is_big_endian: bool
|
||||
endianess: gguf.GGUFEndian
|
||||
use_temp_file: bool
|
||||
use_reflinks: bool
|
||||
lazy: bool
|
||||
dry_run: bool
|
||||
hparams: dict[str, Any]
|
||||
model_tensors: dict[str, Callable[[], Tensor]]
|
||||
model_tensors: dict[str, ModelTensorInfo]
|
||||
gguf_writer: gguf.GGUFWriter
|
||||
model_name: str | None
|
||||
metadata_override: Path | None
|
||||
@@ -116,7 +127,8 @@ class ModelBase:
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
|
||||
disable_mistral_community_chat_template: bool = False,
|
||||
sentence_transformers_dense_modules: bool = False):
|
||||
sentence_transformers_dense_modules: bool = False,
|
||||
use_reflinks: bool = False):
|
||||
if type(self) is ModelBase or \
|
||||
type(self) is TextModel or \
|
||||
type(self) is MmprojModel:
|
||||
@@ -127,10 +139,12 @@ class ModelBase:
|
||||
|
||||
self.dir_model = dir_model
|
||||
self.ftype = ftype
|
||||
self.ftype_guessed = ftype == gguf.LlamaFileType.GUESSED
|
||||
self.fname_out = fname_out
|
||||
self.is_big_endian = is_big_endian
|
||||
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
|
||||
self.use_temp_file = use_temp_file
|
||||
self.use_reflinks = use_reflinks
|
||||
self.lazy = not eager or (remote_hf_model_id is not None)
|
||||
self.dry_run = dry_run
|
||||
self.remote_hf_model_id = remote_hf_model_id
|
||||
@@ -141,22 +155,40 @@ class ModelBase:
|
||||
self.model_name = model_name
|
||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||
|
||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
|
||||
_, first_tensor = next(self.get_tensors())
|
||||
if first_tensor.dtype == torch.float16:
|
||||
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
else:
|
||||
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
|
||||
self.dequant_model()
|
||||
|
||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||
# find out the most common type
|
||||
hist: dict[gguf.GGMLQuantizationType, int] = {}
|
||||
for t in self.model_tensors.values():
|
||||
if t.auto_qtype is not None:
|
||||
if t.auto_qtype not in hist:
|
||||
hist[t.auto_qtype] = 0
|
||||
hist[t.auto_qtype] += t.size
|
||||
max_qtype = gguf.GGMLQuantizationType.F32
|
||||
max_size = 0
|
||||
for qtype, size in hist.items():
|
||||
if size > max_size:
|
||||
max_qtype = qtype
|
||||
max_size = size
|
||||
# TODO: add more type if they're used as auto_qtype
|
||||
if max_qtype == gguf.GGMLQuantizationType.F32:
|
||||
self.ftype = gguf.LlamaFileType.ALL_F32
|
||||
elif max_qtype == gguf.GGMLQuantizationType.F16:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_F16
|
||||
elif max_qtype == gguf.GGMLQuantizationType.BF16:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
|
||||
elif max_qtype == gguf.GGMLQuantizationType.Q8_0:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_Q8_0
|
||||
elif max_qtype == gguf.GGMLQuantizationType.Q4_1:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_Q4_1
|
||||
elif max_qtype == gguf.GGMLQuantizationType.TQ1_0:
|
||||
self.ftype = gguf.LlamaFileType.MOSTLY_TQ1_0
|
||||
|
||||
# Configure GGUF Writer
|
||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
||||
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
|
||||
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard,
|
||||
use_reflinks=self.use_reflinks)
|
||||
|
||||
# Mistral specific
|
||||
self.disable_mistral_community_chat_template = disable_mistral_community_chat_template
|
||||
@@ -175,8 +207,8 @@ class ModelBase:
|
||||
return None
|
||||
raise KeyError(f"could not find any of: {keys}")
|
||||
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
|
||||
tensors: dict[str, Callable[[], Tensor]] = {}
|
||||
def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, ModelTensorInfo]:
|
||||
tensors: dict[str, ModelTensorInfo] = {}
|
||||
|
||||
if remote_hf_model_id is not None:
|
||||
is_safetensors = True
|
||||
@@ -184,7 +216,14 @@ class ModelBase:
|
||||
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
|
||||
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
|
||||
for name, remote_tensor in remote_tensors.items():
|
||||
tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)
|
||||
dtype = LazyTorchTensor._dtype_str_map[remote_tensor.dtype]
|
||||
qtype = LazyTorchTensor._qtype_map.get(dtype)
|
||||
tensors[name] = ModelTensorInfo(
|
||||
load=lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r),
|
||||
size=math.prod(remote_tensor.shape),
|
||||
src_type=str(dtype),
|
||||
auto_qtype=qtype,
|
||||
)
|
||||
|
||||
return tensors
|
||||
|
||||
@@ -218,8 +257,7 @@ class ModelBase:
|
||||
logger.info(f"gguf: indexing model part '{part_name}'")
|
||||
ctx: ContextManager[Any]
|
||||
if is_safetensors:
|
||||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||
ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name, reflink=self.use_reflinks))
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||
|
||||
@@ -228,19 +266,28 @@ class ModelBase:
|
||||
|
||||
for name in model_part.keys():
|
||||
if is_safetensors:
|
||||
data: gguf.utility.LocalTensor = model_part[name]
|
||||
dtype = LazyTorchTensor._dtype_str_map[data.dtype]
|
||||
size = math.prod(data.shape)
|
||||
if self.lazy:
|
||||
data = model_part.get_slice(name)
|
||||
data_gen = lambda data=data: LazyTorchTensor.from_safetensors_slice(data) # noqa: E731
|
||||
data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data) # noqa: E731
|
||||
else:
|
||||
data = model_part.get_tensor(name)
|
||||
data_gen = lambda data=data: data # noqa: E731
|
||||
data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape) # noqa: E731
|
||||
else:
|
||||
data = model_part[name]
|
||||
data_torch: Tensor = model_part[name]
|
||||
size = data_torch.numel()
|
||||
dtype = data_torch.dtype
|
||||
if self.lazy:
|
||||
data_gen = lambda data=data: LazyTorchTensor.from_eager(data) # noqa: E731
|
||||
data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data) # noqa: E731
|
||||
else:
|
||||
data_gen = lambda data=data: data # noqa: E731
|
||||
tensors[name] = data_gen
|
||||
data_gen = lambda data=data_torch: data # noqa: E731
|
||||
qtype = LazyTorchTensor._qtype_map.get(dtype)
|
||||
tensors[name] = ModelTensorInfo(
|
||||
load=data_gen,
|
||||
size=size,
|
||||
src_type=str(dtype),
|
||||
auto_qtype=qtype,
|
||||
)
|
||||
|
||||
# verify tensor name presence and identify potentially missing files
|
||||
if len(tensor_names_from_index) > 0:
|
||||
@@ -261,7 +308,7 @@ class ModelBase:
|
||||
|
||||
def dequant_model(self):
|
||||
tensors_to_remove: list[str] = []
|
||||
new_tensors: dict[str, Callable[[], Tensor]] = {}
|
||||
new_tensors: dict[str, ModelTensorInfo] = {}
|
||||
|
||||
if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict):
|
||||
quant_method = quant_config.get("quant_method")
|
||||
@@ -339,7 +386,12 @@ class ModelBase:
|
||||
weight_name = name.removesuffix("_scale")
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
|
||||
self.model_tensors[weight_name] = ModelTensorInfo(
|
||||
load=lambda w=w, s=s: dequant_bitnet(w.load(), s.load()),
|
||||
size=w.size,
|
||||
src_type="bitnet",
|
||||
auto_qtype=gguf.GGMLQuantizationType.TQ1_0,
|
||||
)
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "fp8":
|
||||
for name in self.model_tensors.keys():
|
||||
@@ -347,9 +399,17 @@ class ModelBase:
|
||||
weight_name = name.removesuffix("_scale_inv")
|
||||
w = self.model_tensors[weight_name]
|
||||
s = self.model_tensors[name]
|
||||
self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s())
|
||||
# TODO: change to FP8 once natively supported
|
||||
auto_qtype = s.auto_qtype if s.auto_qtype is not gguf.GGMLQuantizationType.F32 else gguf.GGMLQuantizationType.BF16
|
||||
self.model_tensors[weight_name] = ModelTensorInfo(
|
||||
load=lambda w=w, s=s: dequant_simple(w.load(), s.load()),
|
||||
size=w.size,
|
||||
src_type=w.src_type,
|
||||
auto_qtype=auto_qtype,
|
||||
)
|
||||
tensors_to_remove.append(name)
|
||||
elif quant_method == "gptq":
|
||||
bits = quant_config["bits"]
|
||||
for name in self.model_tensors.keys():
|
||||
if name.endswith(".qweight"):
|
||||
base_name = name.removesuffix(".qweight")
|
||||
@@ -357,10 +417,13 @@ class ModelBase:
|
||||
qweight = self.model_tensors[base_name + ".qweight"]
|
||||
qzeros = self.model_tensors[base_name + ".qzeros"]
|
||||
scales = self.model_tensors[base_name + ".scales"]
|
||||
new_tensors[base_name + ".weight"] = (
|
||||
lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
|
||||
g(), w(), z(), s()
|
||||
)
|
||||
new_tensors[base_name + ".weight"] = ModelTensorInfo(
|
||||
load=lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
|
||||
g.load(), w.load(), z.load(), s.load()
|
||||
),
|
||||
size=qweight.size, # TODO: use more accurate value
|
||||
src_type=f"GPTQ-{bits}bit",
|
||||
auto_qtype=gguf.GGMLQuantizationType.Q8_0 if bits == 8 else gguf.GGMLQuantizationType.Q4_1,
|
||||
)
|
||||
tensors_to_remove += [
|
||||
base_name + n
|
||||
@@ -382,8 +445,8 @@ class ModelBase:
|
||||
self.model_tensors[name] = value
|
||||
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
for name, gen in self.model_tensors.items():
|
||||
yield name, gen()
|
||||
for name, t in self.model_tensors.items():
|
||||
yield name, t.load()
|
||||
|
||||
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
|
||||
if key not in gguf.MODEL_TENSORS[self.model_arch]:
|
||||
@@ -438,10 +501,12 @@ class ModelBase:
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
tensor_info = self.model_tensors.get(name)
|
||||
old_dtype: str = tensor_info.src_type if tensor_info is not None else str(data_torch.dtype)
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
# TODO: handle pre-quantized tensors for repacking
|
||||
if data_torch.dtype not in (torch.float16, torch.bfloat16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
# use the first number-like part of the tensor name as the block id
|
||||
@@ -452,8 +517,18 @@ class ModelBase:
|
||||
break
|
||||
|
||||
for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
|
||||
# TODO: why do we squeeze here?
|
||||
# data = data_torch.squeeze().numpy()
|
||||
old_qtype = LazyTorchTensor._qtype_map[data_torch.dtype]
|
||||
|
||||
# workaround BF16 not being supported by Numpy
|
||||
if data_torch.dtype == torch.bfloat16:
|
||||
# Need a contiguous last dimension otherwise byte view doesn't work
|
||||
# (problem can be reproduced with DeepSeek-V2-Lite-Chat)
|
||||
data_torch = data_torch.contiguous().view(torch.uint8)
|
||||
|
||||
# if data ends up empty, it means data_torch was a scalar tensor -> restore
|
||||
if len(data_torch.shape) == 0:
|
||||
data_torch = data_torch.reshape(1)
|
||||
|
||||
data = data_torch.numpy()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
@@ -512,7 +587,9 @@ class ModelBase:
|
||||
|
||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||
if isinstance(data_qtype, bool):
|
||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
if self.ftype_guessed:
|
||||
data_qtype = old_qtype if tensor_info is None or tensor_info.auto_qtype is None else tensor_info.auto_qtype
|
||||
elif self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
@@ -527,12 +604,18 @@ class ModelBase:
|
||||
else:
|
||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||
|
||||
try:
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
except gguf.QuantError as e:
|
||||
logger.warning("%s, %s", e, "falling back to F16")
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
if old_qtype != data_qtype:
|
||||
if old_qtype not in (
|
||||
gguf.GGMLQuantizationType.F32,
|
||||
gguf.GGMLQuantizationType.F16,
|
||||
):
|
||||
data = gguf.quants.dequantize(data, old_qtype)
|
||||
try:
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
except gguf.QuantError as e:
|
||||
logger.warning("%s, %s", e, "falling back to F16")
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
data = gguf.quants.quantize(data, data_qtype)
|
||||
|
||||
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||
|
||||
@@ -4705,7 +4788,7 @@ class Plamo2Model(TextModel):
|
||||
del bid # unused
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
data_torch = -torch.exp(data_torch)
|
||||
data_torch = -torch.exp(data_torch.float())
|
||||
elif name.endswith(".dt_bias"):
|
||||
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
|
||||
elif name.endswith(".dt_norm_weight"):
|
||||
@@ -6229,7 +6312,7 @@ class MambaModel(TextModel):
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
data_torch = -torch.exp(data_torch.float())
|
||||
|
||||
# [4 1 8192 1] -> [4 8192 1 1]
|
||||
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
|
||||
@@ -6334,7 +6417,7 @@ class Mamba2Model(TextModel):
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
data_torch = -torch.exp(data_torch.float())
|
||||
|
||||
yield (new_name, data_torch)
|
||||
|
||||
@@ -6434,7 +6517,7 @@ class JambaModel(TextModel):
|
||||
|
||||
if name.endswith(".A_log"):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
data_torch = -torch.exp(data_torch.float())
|
||||
|
||||
yield (new_name, data_torch)
|
||||
|
||||
@@ -9983,12 +10066,20 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
"F8_E5M2": torch.float8_e5m2,
|
||||
}
|
||||
|
||||
_qtype_map: dict[torch.dtype, gguf.GGMLQuantizationType] = {
|
||||
torch.float64: gguf.GGMLQuantizationType.F64,
|
||||
torch.float32: gguf.GGMLQuantizationType.F32,
|
||||
torch.float16: gguf.GGMLQuantizationType.F16,
|
||||
torch.bfloat16: gguf.GGMLQuantizationType.BF16,
|
||||
}
|
||||
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
args=(self,),
|
||||
func=(lambda s: s.numpy())
|
||||
func=(lambda s: s.numpy()),
|
||||
ranges=self._ranges,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@@ -10002,6 +10093,16 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
|
||||
def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
|
||||
dtype = cls._dtype_str_map[tensor.dtype]
|
||||
return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
|
||||
dtype = cls._dtype_str_map[t.dtype]
|
||||
shape = t.shape
|
||||
lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r), ranges=(t.data_range,))
|
||||
return cast(torch.Tensor, lazy)
|
||||
|
||||
@classmethod
|
||||
def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
|
||||
dtype = cls._dtype_str_map[remote_tensor.dtype]
|
||||
@@ -10020,7 +10121,27 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
if func is torch.Tensor.numpy:
|
||||
return args[0].numpy()
|
||||
|
||||
return cls._wrap_fn(func)(*args, **kwargs)
|
||||
result = cls._wrap_fn(func)(*args, **kwargs)
|
||||
|
||||
def get_dim(index: int, key: str = "dim", default: int = 0, args=args, kwargs=kwargs) -> int:
|
||||
# TODO: handle negative dim
|
||||
if len(args) > index:
|
||||
return args[index]
|
||||
else:
|
||||
return kwargs.get(key, default)
|
||||
|
||||
# Track file ranges
|
||||
# TODO: handle tensor splits (with torch.split, torch.chunk, and torch.__getitem__)
|
||||
if isinstance(result, LazyTorchTensor):
|
||||
if isinstance(args[0], LazyTorchTensor):
|
||||
if func is torch.Tensor.to and not isinstance(args[1], torch.dtype):
|
||||
result._ranges = args[0]._ranges
|
||||
if func is torch.stack and get_dim(1) == 0:
|
||||
if all(isinstance(t, LazyTorchTensor) and len(t._ranges) > 0 for t in args[0]):
|
||||
# collect ranges of all stacked tensors
|
||||
result._ranges = tuple(r for t in args[0] for r in t._ranges)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
@@ -10035,8 +10156,8 @@ def parse_args() -> argparse.Namespace:
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="auto",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for mostly unchanged types",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
@@ -10055,6 +10176,10 @@ def parse_args() -> argparse.Namespace:
|
||||
"--no-lazy", action="store_true",
|
||||
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reflink", action="store_true",
|
||||
help="(Experimental) Use copy-on-write reflinks when possible (e.g. on BTRFS, XFS, ZFS, etc.). File alignment and padding will differ compared to not using this option. Should be very fast when source model layout is compatible enough.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-name", type=str, default=None,
|
||||
help="name of the model",
|
||||
@@ -10249,7 +10374,8 @@ def main() -> None:
|
||||
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
||||
small_first_shard=args.no_tensor_first_split,
|
||||
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
|
||||
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
|
||||
use_reflinks=args.reflink,
|
||||
)
|
||||
|
||||
if args.vocab_only:
|
||||
|
||||
@@ -178,6 +178,48 @@ GeForce RTX 3070 8.6
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```

### Overriding the CUDA Version

If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`:

```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
```

#### Fixing Compatibility Issues with Old CUDA and New glibc

If you try to use an old CUDA version (e.g. v11.7) with a new glibc version, you can get errors like this:

```
/usr/include/bits/mathcalls.h(83): error: exception specification is
incompatible with that of previous function "cospi"

/opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545):
here
```

It seems the least bad solution is to patch the CUDA installation to declare the correct signatures.
Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`:

```C++
// original lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x);

// edited lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x) noexcept (true);
```

### Runtime CUDA environment variables

You may set the [CUDA environment variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
@@ -24,7 +24,7 @@ Legend:
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
@@ -9307,37 +9307,37 @@
|
||||
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","SYCL"
|
||||
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","SYCL"
|
||||
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","SYCL"
|
||||
"SYCL0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","SYCL"
|
@@ -184,8 +184,13 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
        const char * name   = gguf_get_tensor_name  (ctx, i);
        const size_t size   = gguf_get_tensor_size  (ctx, i);
        const size_t offset = gguf_get_tensor_offset(ctx, i);
        const auto   type   = gguf_get_tensor_type  (ctx, i);

        printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
        const char * type_name  = ggml_type_name(type);
        const size_t type_size  = ggml_type_size(type);
        const size_t n_elements = size / type_size;

        printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu, type = %s, n_elts = %zu\n", __func__, i, name, size, offset, type_name, n_elements);
    }
}
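Note that the `n_elts` computed above divides the tensor byte size by `ggml_type_size()`, which for block-quantized types is the byte size of one block rather than one element. A hedged sketch of a count that also covers quantized types, assuming `ggml_blck_size()` from `ggml.h` returns the number of elements per block:

```c
#include <stddef.h>
#include "ggml.h"

// Sketch: element count from a tensor's byte size, valid for both plain and
// block-quantized types (for F32/F16 the block size is 1, so this reduces to
// the size / type_size computation used above).
static size_t tensor_n_elements(enum ggml_type type, size_t nbytes) {
    const size_t type_size = ggml_type_size(type);           // bytes per block
    const size_t blck_size = (size_t) ggml_blck_size(type);  // elements per block
    return nbytes / type_size * blck_size;
}
```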
@@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
|
||||
uint8_t *patmp = atmp;
|
||||
int vsums;
|
||||
int tmp;
|
||||
int tmp, t1, t2, t3, t4, t5, t6, t7;
|
||||
__asm__ __volatile__(
|
||||
"vsetivli zero, 16, e8, m1\n\t"
|
||||
"vmv.v.x v8, zero\n\t"
|
||||
"lb zero, 15(%[sc])\n\t"
|
||||
"vle8.v v1, (%[sc])\n\t"
|
||||
"vle8.v v2, (%[bsums])\n\t"
|
||||
"addi %[tmp], %[bsums], 16\n\t"
|
||||
"vand.vi v0, v1, 0xF\n\t"
|
||||
"vsrl.vi v1, v1, 4\n\t"
|
||||
"vle8.v v3, (%[tmp])\n\t"
|
||||
"vse8.v v0, (%[scale])\n\t"
|
||||
"vsetivli zero, 16, e16, m2\n\t"
|
||||
"vle16.v v2, (%[bsums])\n\t"
|
||||
"vzext.vf2 v0, v1\n\t"
|
||||
"vwmul.vv v4, v0, v2\n\t"
|
||||
"vsetivli zero, 16, e32, m4\n\t"
|
||||
@@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
__asm__ __volatile__(
|
||||
"vsetvli zero, %[vl32], e8, m2\n\t"
|
||||
"lb zero, 31(%[q2])\n\t"
|
||||
"addi %[tmp], %[q2], 16\n\t"
|
||||
"addi %[t1], %[q8], 16\n\t"
|
||||
"vsetivli zero, 16, e8, m1\n\t"
|
||||
"vle8.v v0, (%[q2])\n\t"
|
||||
"vle8.v v1, (%[tmp])\n\t"
|
||||
"vsrl.vi v2, v0, 2\n\t"
|
||||
"vsrl.vi v3, v1, 2\n\t"
|
||||
"vsrl.vi v4, v0, 4\n\t"
|
||||
"vsrl.vi v6, v0, 6\n\t"
|
||||
"vand.vi v0, v0, 0x3\n\t"
|
||||
"vand.vi v2, v2, 0x3\n\t"
|
||||
"vand.vi v4, v4, 0x3\n\t"
|
||||
"vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"addi %[tmp], %[q8], 32\n\t"
|
||||
"vle8.v v8, (%[q8])\n\t"
|
||||
"vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"vle8.v v9, (%[t1])\n\t"
|
||||
"addi %[t1], %[t1], 32\n\t"
|
||||
"vsrl.vi v5, v1, 4\n\t"
|
||||
"vsrl.vi v6, v0, 6\n\t"
|
||||
"vsrl.vi v7, v1, 6\n\t"
|
||||
"vle8.v v10, (%[tmp])\n\t"
|
||||
"vle8.v v11, (%[t1])\n\t"
|
||||
"addi %[tmp], %[tmp], 32\n\t"
|
||||
"addi %[t1], %[t1], 32\n\t"
|
||||
"vand.vi v0, v0, 0x3\n\t"
|
||||
"vand.vi v1, v1, 0x3\n\t"
|
||||
"vand.vi v2, v2, 0x3\n\t"
|
||||
"vle8.v v12, (%[tmp])\n\t"
|
||||
"vle8.v v13, (%[t1])\n\t"
|
||||
"addi %[tmp], %[tmp], 32\n\t"
|
||||
"addi %[t1], %[t1], 32\n\t"
|
||||
"vand.vi v3, v3, 0x3\n\t"
|
||||
"vand.vi v4, v4, 0x3\n\t"
|
||||
"vand.vi v5, v5, 0x3\n\t"
|
||||
"vle8.v v14, (%[tmp])\n\t"
|
||||
"vle8.v v15, (%[t1])\n\t"
|
||||
"vwmul.vv v16, v0, v8\n\t"
|
||||
"vwmul.vv v18, v1, v9\n\t"
|
||||
"vwmul.vv v20, v2, v10\n\t"
|
||||
"vwmul.vv v22, v3, v11\n\t"
|
||||
"vwmul.vv v24, v4, v12\n\t"
|
||||
"vsetivli zero, 16, e16, m2\n\t"
|
||||
"vwmul.vv v26, v5, v13\n\t"
|
||||
"vwmul.vv v28, v6, v14\n\t"
|
||||
"vwmul.vv v30, v7, v15\n\t"
|
||||
"vsetivli zero, 8, e16, m1\n\t"
|
||||
"vmv.v.x v0, zero\n\t"
|
||||
"vwredsum.vs v10, v16, v0\n\t"
|
||||
"lbu %[tmp], 0(%[scale])\n\t"
|
||||
"vwredsum.vs v8, v16, v0\n\t"
|
||||
"vwredsum.vs v9, v18, v0\n\t"
|
||||
"vwredsum.vs v8, v20, v0\n\t"
|
||||
"vwredsum.vs v7, v22, v0\n\t"
|
||||
"vwredsum.vs v11, v24, v0\n\t"
|
||||
"vwredsum.vs v12, v26, v0\n\t"
|
||||
"vwredsum.vs v13, v28, v0\n\t"
|
||||
"vwredsum.vs v14, v30, v0\n\t"
|
||||
"lbu %[t1], 1(%[scale])\n\t"
|
||||
"vwredsum.vs v10, v20, v0\n\t"
|
||||
"vwredsum.vs v11, v22, v0\n\t"
|
||||
"lbu %[t2], 2(%[scale])\n\t"
|
||||
"vwredsum.vs v12, v24, v0\n\t"
|
||||
"vwredsum.vs v13, v26, v0\n\t"
|
||||
"lbu %[t3], 3(%[scale])\n\t"
|
||||
"vwredsum.vs v14, v28, v0\n\t"
|
||||
"vwredsum.vs v15, v30, v0\n\t"
|
||||
"lbu %[t4], 4(%[scale])\n\t"
|
||||
"vwredsum.vs v8, v17, v8\n\t"
|
||||
"vwredsum.vs v9, v19, v9\n\t"
|
||||
"lbu %[t5], 5(%[scale])\n\t"
|
||||
"vwredsum.vs v10, v21, v10\n\t"
|
||||
"vwredsum.vs v11, v23, v11\n\t"
|
||||
"lbu %[t6], 6(%[scale])\n\t"
|
||||
"vwredsum.vs v12, v25, v12\n\t"
|
||||
"vwredsum.vs v13, v27, v13\n\t"
|
||||
"lbu %[t7], 7(%[scale])\n\t"
|
||||
"vwredsum.vs v14, v29, v14\n\t"
|
||||
"vwredsum.vs v15, v31, v15\n\t"
|
||||
"vsetivli zero, 4, e32, m1\n\t"
|
||||
"vslideup.vi v10, v9, 1\n\t"
|
||||
"vslideup.vi v8, v7, 1\n\t"
|
||||
"vslideup.vi v11, v12, 1\n\t"
|
||||
"vslideup.vi v13, v14, 1\n\t"
|
||||
"vslideup.vi v10, v8, 2\n\t"
|
||||
"vslideup.vi v11, v13, 2\n\t"
|
||||
"vsetivli zero, 8, e32, m2\n\t"
|
||||
"vle8.v v15, (%[scale])\n\t"
|
||||
"vzext.vf4 v12, v15\n\t"
|
||||
"vmul.vv v10, v10, v12\n\t"
|
||||
"vredsum.vs v0, v10, v0\n\t"
|
||||
"vmul.vx v0, v8, %[tmp]\n\t"
|
||||
"vmul.vx v1, v9, %[t1]\n\t"
|
||||
"vmacc.vx v0, %[t2], v10\n\t"
|
||||
"vmacc.vx v1, %[t3], v11\n\t"
|
||||
"vmacc.vx v0, %[t4], v12\n\t"
|
||||
"vmacc.vx v1, %[t5], v13\n\t"
|
||||
"vmacc.vx v0, %[t6], v14\n\t"
|
||||
"vmacc.vx v1, %[t7], v15\n\t"
|
||||
"vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[isum], %[isum], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
|
||||
"vmv.x.s %[t1], v1\n\t"
|
||||
"add %[isum], %[isum], %[tmp]\n\t"
|
||||
"add %[isum], %[isum], %[t1]"
|
||||
: [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
|
||||
, [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
|
||||
, [isum] "+&r" (isum)
|
||||
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
|
||||
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
|
||||
: "memory"
|
||||
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
|
||||
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
|
||||
@@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
int8_t * scale = (int8_t *)utmp;
|
||||
int tmp;
|
||||
int tmp, t1, t2, t3, t4, t5, t6, t7;
|
||||
__asm__ __volatile__(
|
||||
"vsetivli zero, 12, e8, m1\n\t"
|
||||
"vle8.v v0, (%[s6b])\n\t"
|
||||
@@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
int isum = 0;
|
||||
for (int j = 0; j < QK_K; j += 128) {
|
||||
__asm__ __volatile__(
|
||||
"lb zero, 31(%[q3])\n\t"
|
||||
"vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
|
||||
"vle8.v v8, (%[q3])\n\t"
|
||||
"vsrl.vi v10, v8, 2\n\t"
|
||||
"vsrl.vi v12, v8, 4\n\t"
|
||||
"vsrl.vi v14, v8, 6\n\t"
|
||||
"lb zero, 64(%[q8])\n\t"
|
||||
"vand.vi v8, v8, 3\n\t"
|
||||
"vand.vi v10, v10, 3\n\t"
|
||||
"vand.vi v12, v12, 3\n\t"
|
||||
"vle8.v v2, (%[qh])\n\t"
|
||||
"lb zero, 127(%[q8])\n\t"
|
||||
"vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"vmseq.vx v0, v4, zero\n\t"
|
||||
"vadd.vi v8, v8, -4, v0.t\n\t"
|
||||
"lb zero, 0(%[q8])\n\t"
|
||||
"vand.vx v4, v2, %[m]\n\t"
|
||||
"slli %[m], %[m], 1\n\t"
|
||||
"vmseq.vx v0, v4, zero\n\t"
|
||||
@@ -994,34 +1044,43 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
"vadd.vi v14, v14, -4, v0.t\n\t"
|
||||
"vsetvli zero, %[vl128], e8, m8\n\t"
|
||||
"vle8.v v0, (%[q8])\n\t"
|
||||
"lb %[tmp], 0(%[scale])\n\t"
|
||||
"lb %[t1], 1(%[scale])\n\t"
|
||||
"lb %[t2], 2(%[scale])\n\t"
|
||||
"lb %[t3], 3(%[scale])\n\t"
|
||||
"vsetvli zero, %[vl64], e8, m4\n\t"
|
||||
"vwmul.vv v16, v0, v8\n\t"
|
||||
"vwmul.vv v24, v4, v12\n\t"
|
||||
"vsetivli zero, 16, e16, m2\n\t"
|
||||
"vmv.v.x v0, zero\n\t"
|
||||
"vwredsum.vs v10, v16, v0\n\t"
|
||||
"vwredsum.vs v8, v16, v0\n\t"
|
||||
"lb %[t4], 4(%[scale])\n\t"
|
||||
"lb %[t5], 5(%[scale])\n\t"
|
||||
"vwredsum.vs v9, v18, v0\n\t"
|
||||
"vwredsum.vs v8, v20, v0\n\t"
|
||||
"vwredsum.vs v7, v22, v0\n\t"
|
||||
"vwredsum.vs v11, v24, v0\n\t"
|
||||
"vwredsum.vs v12, v26, v0\n\t"
|
||||
"vwredsum.vs v13, v28, v0\n\t"
|
||||
"vwredsum.vs v14, v30, v0\n\t"
|
||||
"vwredsum.vs v10, v20, v0\n\t"
|
||||
"vwredsum.vs v11, v22, v0\n\t"
|
||||
"vwredsum.vs v12, v24, v0\n\t"
|
||||
"lb %[t6], 6(%[scale])\n\t"
|
||||
"lb %[t7], 7(%[scale])\n\t"
|
||||
"vwredsum.vs v13, v26, v0\n\t"
|
||||
"vwredsum.vs v14, v28, v0\n\t"
|
||||
"vwredsum.vs v15, v30, v0\n\t"
|
||||
"vsetivli zero, 4, e32, m1\n\t"
|
||||
"vslideup.vi v10, v9, 1\n\t"
|
||||
"vslideup.vi v8, v7, 1\n\t"
|
||||
"vslideup.vi v11, v12, 1\n\t"
|
||||
"vslideup.vi v13, v14, 1\n\t"
|
||||
"vslideup.vi v10, v8, 2\n\t"
|
||||
"vslideup.vi v11, v13, 2\n\t"
|
||||
"vsetivli zero, 8, e32, m2\n\t"
|
||||
"vle8.v v15, (%[scale])\n\t"
|
||||
"vsext.vf4 v12, v15\n\t"
|
||||
"vmul.vv v10, v10, v12\n\t"
|
||||
"vredsum.vs v0, v10, v0\n\t"
|
||||
"vmul.vx v0, v8, %[tmp]\n\t"
|
||||
"vmul.vx v1, v9, %[t1]\n\t"
|
||||
"vmacc.vx v0, %[t2], v10\n\t"
|
||||
"vmacc.vx v1, %[t3], v11\n\t"
|
||||
"vmacc.vx v0, %[t4], v12\n\t"
|
||||
"vmacc.vx v1, %[t5], v13\n\t"
|
||||
"vmacc.vx v0, %[t6], v14\n\t"
|
||||
"vmacc.vx v1, %[t7], v15\n\t"
|
||||
"vmv.x.s %[tmp], v0\n\t"
|
||||
"add %[isum], %[isum], %[tmp]"
|
||||
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
|
||||
"vmv.x.s %[t1], v1\n\t"
|
||||
"add %[isum], %[isum], %[tmp]\n\t"
|
||||
"add %[isum], %[isum], %[t1]"
|
||||
: [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3)
|
||||
, [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7)
|
||||
, [m] "+&r" (m), [isum] "+&r" (isum)
|
||||
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
|
||||
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
|
||||
: "memory"
|
||||
|
||||
@@ -7,6 +7,10 @@
|
||||
|
||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||
|
||||
const int CUDA_CPY_TILE_DIM_2D = 32; // 2D tile dimension for transposed blocks
|
||||
const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
|
||||
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
@@ -35,6 +39,55 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
|
||||
const T* src = reinterpret_cast<const T*>(cx);
|
||||
T* dst = reinterpret_cast<T*>(cdst);
|
||||
|
||||
const int64_t nmat = ne / (ne00 * ne01);
|
||||
const int64_t n = ne00 * ne01;
|
||||
|
||||
const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
|
||||
const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
|
||||
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
|
||||
|
||||
__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
|
||||
|
||||
const unsigned int imat = blockIdx.z * CUDA_CPY_BLOCK_NM + i;
|
||||
if (imat >= nmat)
|
||||
break;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
||||
if(x < ne01 && y + j < ne00){
|
||||
const int row = threadIdx.y+j;
|
||||
const int col = threadIdx.x * sizeof(float)/sizeof(T);
|
||||
T *tile2 = reinterpret_cast<T*>(tile[row]);
|
||||
tile2[col] = src[imat*n + (y+j)*ne01 + x];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
|
||||
if (ty + j < ne01 && tx < ne00) {
|
||||
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
|
||||
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
|
||||
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *)(cdsti);
|
||||
|
||||
@@ -136,15 +189,38 @@ cudaStream_t stream) {
|
||||
(cx, cdst, ne);
|
||||
}
|
||||
|
||||
template<typename src_t, typename dst_t>
|
||||
template<typename src_t, typename dst_t, bool transposed = false>
|
||||
static void ggml_cpy_flt_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
if (transposed) {
|
||||
GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed
|
||||
int ne00n, ne01n, ne02n;
|
||||
if (nb00 < nb02) {
|
||||
ne00n = ne00;
|
||||
ne01n = ne01;
|
||||
ne02n = ne02;
|
||||
} else if (nb00 > nb02) {
|
||||
ne00n = ne00;
|
||||
ne01n = ne01*ne02;
|
||||
ne02n = 1;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
|
||||
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
|
||||
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
|
||||
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
|
||||
cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
} else {
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
@@ -310,6 +386,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
char * src1_ddc = (char *) src1->data;
|
||||
|
||||
const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
|
||||
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;
|
||||
|
||||
if (src0->type == src1->type && contiguous_srcs) {
|
||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||
@@ -322,7 +399,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
@@ -361,7 +442,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
|
||||
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
@@ -375,7 +460,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
if (can_be_transposed) {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
} else {
|
||||
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||
}
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
|
||||
if (contiguous_srcs) {
|
||||
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
|
||||
|
||||
@@ -2113,7 +2113,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
|
||||
src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
|
||||
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
|
||||
ggml_backend_buft_is_cuda_split(src1->buffer->buft);
|
||||
@@ -2207,16 +2207,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[id].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
} else {
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
|
||||
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||
return;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
|
||||
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
|
||||
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -119,15 +119,21 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
    }
}

bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {

bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne,
                              const size_t * src0_nb, const int src1_ncols, bool mul_mat_id) {
    if (ggml_is_quantized(type)) {
        return false;
    }

    if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
    const size_t ts = ggml_type_size(type);
    if (src0_ne[0] % (warp_size * (4/ts)) != 0) {
        return false;
    }
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        if (src0_nb[i] % (2*ts) != 0) {
            return false;
        }
    }
    if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
        return false;
    }
@@ -17,7 +17,7 @@ struct mmf_ids_data {

void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const size_t * src0_nb, const int src1_ncols, bool mul_mat_id);

template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
@@ -716,10 +716,16 @@ void ggml_cuda_op_mul_mat_vec_f(
    GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size);
}

bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11) {
    if (src0_ne[0] % 2 != 0) {
        return false;
    }
    const size_t ts = ggml_type_size(type);
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        if (src0_nb[i] % (2*ts) != 0) {
            return false;
        }
    }
    switch (type) {
        case GGML_TYPE_F32:
            if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
@@ -9,4 +9,4 @@ void ggml_cuda_op_mul_mat_vec_f(
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream);

bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11);
@@ -367,7 +367,13 @@ struct ggml_backend_hexagon_buffer_context {
    ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
        size += 4 * 1024; // extra page for padding

        this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        if (rpcmem_alloc2) {
            this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        } else {
            GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
            this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        }

        if (!this->base) {
            GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
            throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
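The runtime check on `rpcmem_alloc2` above works because the symbol is declared weak (see the `# pragma weak rpcmem_alloc2` hunk further below), so a missing export resolves to a null address instead of a load failure. A minimal, self-contained sketch of that idiom with illustrative names (not the actual FastRPC API):

```c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

// Declared weak: if no definition is present at load time, the symbol's
// address is NULL rather than causing a link/load error.
extern void * new_alloc(size_t size) __attribute__((weak));

static void * old_alloc(size_t size) { return malloc(size); }

void * alloc_compat(size_t size) {
    if (new_alloc) { // non-null only when the newer API is actually available
        return new_alloc(size);
    }
    fprintf(stderr, "new_alloc not found, falling back to old_alloc\n");
    return old_alloc(size);
}
```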
@@ -1679,12 +1685,13 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
    }

    // Get session URI
    char htp_uri[256];
    sprintf(htp_uri, "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);

    char session_uri[256];
    {
        struct remote_rpc_get_uri u;
        char htp_uri[256];
        snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);

        struct remote_rpc_get_uri u = {};
        u.session_id = this->session_id;
        u.domain_name = const_cast<char *>(CDSP_DOMAIN_NAME);
        u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
@@ -1695,8 +1702,12 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {

        int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
        if (err != AEE_SUCCESS) {
            GGML_LOG_ERROR("ggml-hex: failed to get URI for session %d : error 0x%x\n", dev_id, err);
            throw std::runtime_error("ggml-hex: remote_session_control(get-uri) failed (see log for details)");
            // fallback to single session uris
            int htp_URI_domain_len = strlen(htp_uri) + MAX_DOMAIN_NAMELEN;

            snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);

            GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
        }
    }
@@ -3668,6 +3679,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
|
||||
}
|
||||
}
|
||||
|
||||
if(opt_arch < 75) {
|
||||
opt_ndev = 1;
|
||||
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
|
||||
|
||||
// Create devices / sessions
|
||||
|
||||
@@ -64,6 +64,7 @@ extern "C" {
|
||||
# pragma weak remote_handle64_control
|
||||
# pragma weak fastrpc_mmap
|
||||
# pragma weak fastrpc_munmap
|
||||
# pragma weak rpcmem_alloc2
|
||||
#endif
|
||||
|
||||
#if !defined(_WINDOWS)
|
||||
|
||||
@@ -42,8 +42,8 @@ void ggml_print_backtrace(void);
|
||||
# define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
// required for mmap as gguf only guarantees 32-byte alignment
|
||||
#define TENSOR_ALIGNMENT 32
|
||||
// required for mmap as gguf converted with reflinks from safetensors only guarantees 8-byte alignment
|
||||
#define TENSOR_ALIGNMENT 8
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
|
||||
@@ -35,7 +35,6 @@ struct ggml_metal {
|
||||
// additional, inference-time compiled pipelines
|
||||
ggml_metal_pipelines_t pipelines_ext;
|
||||
|
||||
bool use_bfloat;
|
||||
bool use_fusion;
|
||||
bool use_concurrency;
|
||||
bool use_graph_optimize;
|
||||
@@ -121,11 +120,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||
}
|
||||
}
|
||||
|
||||
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
|
||||
//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
|
||||
|
||||
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
res->use_bfloat = props_dev->has_bfloat;
|
||||
res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
|
||||
res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
|
||||
|
||||
@@ -147,7 +145,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
|
||||
|
||||
memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
|
||||
|
||||
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
|
||||
|
||||
@@ -95,7 +95,9 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder);
|
||||
|
||||
typedef struct ggml_metal_library * ggml_metal_library_t;
|
||||
|
||||
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev);
|
||||
ggml_metal_library_t ggml_metal_library_init (ggml_metal_device_t dev);
|
||||
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose);
|
||||
|
||||
void ggml_metal_library_free(ggml_metal_library_t lib);
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
|
||||
@@ -193,6 +195,7 @@ struct ggml_metal_device_props {
|
||||
bool has_simdgroup_mm;
|
||||
bool has_unified_memory;
|
||||
bool has_bfloat;
|
||||
bool has_tensor;
|
||||
bool use_residency_sets;
|
||||
bool use_shared_buffers;
|
||||
|
||||
|
||||
@@ -21,8 +21,9 @@
|
||||
#define GGML_METAL_HAS_RESIDENCY_SETS 1
|
||||
#endif
|
||||
|
||||
// overload of MTLGPUFamilyMetal3 (not available in some environments)
|
||||
// overload of MTLGPUFamilyMetalX (not available in some environments)
|
||||
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
|
||||
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
|
||||
|
||||
// virtual address for GPU memory allocations
|
||||
static atomic_uintptr_t g_addr_device = 0x000000400ULL;
|
||||
@@ -261,6 +262,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
|
||||
}
|
||||
|
||||
if (ggml_metal_device_get_props(dev)->has_tensor) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
@@ -298,6 +303,72 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
|
||||
if (source == NULL) {
|
||||
GGML_LOG_ERROR("%s: source is NULL\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
id<MTLDevice> device = ggml_metal_device_get_obj(dev);
|
||||
id<MTLLibrary> library = nil;
|
||||
NSError * error = nil;
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
NSString * src = [[NSString alloc] initWithBytes:source
|
||||
length:strlen(source)
|
||||
encoding:NSUTF8StringEncoding];
|
||||
if (!src) {
|
||||
GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@autoreleasepool {
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
library = [device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
if (verbose) {
|
||||
GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]);
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: error compiling source\n", __func__);
|
||||
}
|
||||
library = nil;
|
||||
}
|
||||
|
||||
[options release];
|
||||
}
|
||||
|
||||
[src release];
|
||||
|
||||
if (!library) {
|
||||
if (verbose) {
|
||||
GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (verbose) {
|
||||
GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
|
||||
}
|
||||
|
||||
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
|
||||
if (!res) {
|
||||
GGML_LOG_ERROR("%s: calloc failed\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
res->obj = library;
|
||||
res->device = device;
|
||||
res->pipelines = ggml_metal_pipelines_init();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void ggml_metal_library_free(ggml_metal_library_t lib) {
|
||||
if (!lib) {
|
||||
return;
|
||||
@@ -345,9 +416,9 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
|
||||
if (!mtl_function) {
|
||||
ggml_critical_section_end();
|
||||
|
||||
GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
|
||||
GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
|
||||
}
|
||||
|
||||
return nil;
|
||||
@@ -355,13 +426,21 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
|
||||
|
||||
res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
|
||||
|
||||
ggml_metal_pipelines_add(lib->pipelines, name, res);
|
||||
|
||||
[mtl_function release];
|
||||
|
||||
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
|
||||
(int) res->obj.maxTotalThreadsPerThreadgroup,
|
||||
(int) res->obj.threadExecutionWidth);
|
||||
|
||||
if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
|
||||
ggml_critical_section_end();
|
||||
|
||||
GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
|
||||
|
||||
return nil;
|
||||
}
|
||||
|
||||
ggml_metal_pipelines_add(lib->pipelines, name, res);
|
||||
}
|
||||
|
||||
ggml_critical_section_end();
|
||||
@@ -469,6 +548,126 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
||||
|
||||
dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
|
||||
dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
|
||||
if (getenv("GGML_METAL_BF16_DISABLE") != NULL) {
|
||||
dev->props.has_bfloat = false;
|
||||
}
|
||||
|
||||
dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
|
||||
if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) {
|
||||
dev->props.has_tensor = false;
|
||||
}
|
||||
|
||||
// note: disable the tensor API by default for old chips because with the current implementation it is not useful
|
||||
// - M2 Ultra: ~5% slower
|
||||
// - M4, M4 Max: no significant difference
|
||||
//
|
||||
// TODO: try to update the tensor API kernels to at least match the simdgroup performance
|
||||
if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL &&
|
||||
![[dev->mtl_device name] containsString:@"M5"] &&
|
||||
![[dev->mtl_device name] containsString:@"M6"]) {
|
||||
GGML_LOG_WARN("%s: tensor API disabled for pre-M5 device\n", __func__);
|
||||
dev->props.has_tensor = false;
|
||||
}
|
||||
|
||||
// double-check that the tensor API compiles
|
||||
if (dev->props.has_tensor) {
|
||||
const char * src_tensor_f16 = "\n"
|
||||
"#include <metal_stdlib> \n"
|
||||
"#include <metal_tensor> \n"
|
||||
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
|
||||
" \n"
|
||||
"using namespace metal; \n"
|
||||
"using namespace mpp::tensor_ops; \n"
|
||||
" \n"
|
||||
"kernel void dummy_kernel( \n"
|
||||
" tensor<device half, dextents<int32_t, 2>> A [[buffer(0)]], \n"
|
||||
" tensor<device half, dextents<int32_t, 2>> B [[buffer(1)]], \n"
|
||||
" device float * C [[buffer(2)]], \n"
|
||||
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
|
||||
"{ \n"
|
||||
" auto tA = A.slice(0, (int)tgid.y); \n"
|
||||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
" \n"
|
||||
" auto sA = tA.slice(0, 0); \n"
|
||||
" auto sB = tB.slice(0, 0); \n"
|
||||
" mm.run(sB, sA, cT); \n"
|
||||
" \n"
|
||||
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
|
||||
" \n"
|
||||
" cT.store(tC); \n"
|
||||
"}";
|
||||
|
||||
GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__);
|
||||
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false);
|
||||
if (lib == NULL) {
|
||||
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
|
||||
dev->props.has_tensor = false;
|
||||
} else {
|
||||
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
|
||||
if (!ppl) {
|
||||
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
|
||||
dev->props.has_tensor = false;
|
||||
}
|
||||
|
||||
ggml_metal_library_free(lib);
|
||||
}
|
||||
}
|
||||
|
||||
// try to compile a dummy kernel to determine if the tensor API is supported for bfloat
|
||||
if (dev->props.has_tensor && dev->props.has_bfloat) {
|
||||
const char * src_tensor_bf16 = "\n"
|
||||
"#include <metal_stdlib> \n"
|
||||
"#include <metal_tensor> \n"
|
||||
"#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> \n"
|
||||
" \n"
|
||||
"using namespace metal; \n"
|
||||
"using namespace mpp::tensor_ops; \n"
|
||||
" \n"
|
||||
"kernel void dummy_kernel( \n"
|
||||
" tensor<device bfloat, dextents<int32_t, 2>> A [[buffer(0)]], \n"
|
||||
" tensor<device bfloat, dextents<int32_t, 2>> B [[buffer(1)]], \n"
|
||||
" device float * C [[buffer(2)]], \n"
|
||||
" uint2 tgid [[threadgroup_position_in_grid]]) \n"
|
||||
"{ \n"
|
||||
" auto tA = A.slice(0, (int)tgid.y); \n"
|
||||
" auto tB = B.slice((int)tgid.x, 0); \n"
|
||||
" \n"
|
||||
" matmul2d< \n"
|
||||
" matmul2d_descriptor(8, 8, dynamic_extent), \n"
|
||||
" execution_simdgroups<4>> mm; \n"
|
||||
" \n"
|
||||
" auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>(); \n"
|
||||
" \n"
|
||||
" auto sA = tA.slice(0, 0); \n"
|
||||
" auto sB = tB.slice(0, 0); \n"
|
||||
" mm.run(sB, sA, cT); \n"
|
||||
" \n"
|
||||
" auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(4, 4)); \n"
|
||||
" \n"
|
||||
" cT.store(tC); \n"
|
||||
"}";
|
||||
|
||||
GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__);
|
||||
ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false);
|
||||
if (lib == NULL) {
|
||||
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
|
||||
dev->props.has_bfloat = false;
|
||||
} else {
|
||||
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
|
||||
if (!ppl) {
|
||||
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
|
||||
dev->props.has_bfloat = false;
|
||||
}
|
||||
|
||||
ggml_metal_library_free(lib);
|
||||
}
|
||||
}
|
||||
|
||||
dev->props.use_residency_sets = true;
|
||||
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
|
||||
@@ -476,7 +675,6 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
||||
#endif
|
||||
|
||||
dev->props.use_shared_buffers = dev->props.has_unified_memory;
|
||||
|
||||
if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) {
|
||||
dev->props.use_shared_buffers = false;
|
||||
}
|
||||
@@ -529,6 +727,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
|
||||
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
|
||||
GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");
|
||||
|
||||
|
||||
@@ -9,6 +9,12 @@ __embed_ggml-common.h__
|
||||
|
||||
#include <metal_stdlib>
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
#include <metal_tensor>
|
||||
|
||||
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
|
||||
#endif
|
||||
|
||||
using namespace metal;
|
||||
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
@@ -1742,7 +1748,7 @@ kernel void kernel_op_sum_f32(
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
|
||||
for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
|
||||
sumf += src0[i0];
|
||||
}
|
||||
|
||||
@@ -5467,6 +5473,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_at
|
||||
|
||||
#undef FA_TYPES
|
||||
#undef FA_TYPES_BF
|
||||
#undef FA_TYPES_F32
|
||||
|
||||
constant bool FC_flash_attn_ext_vec_has_mask [[function_constant(FC_FLASH_ATTN_EXT_VEC + 0)]];
|
||||
constant bool FC_flash_attn_ext_vec_has_sinks [[function_constant(FC_FLASH_ATTN_EXT_VEC + 1)]];
|
||||
@@ -6088,6 +6095,7 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk576_dv512")]] kernel flas
|
||||
template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 576, 512, 2>;
|
||||
|
||||
#undef FA_TYPES
|
||||
#undef FA_TYPES_F32
|
||||
|
||||
constant int32_t FC_flash_attn_ext_vec_reduce_DV [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 0)]];
|
||||
constant int32_t FC_flash_attn_ext_vec_reduce_NWG [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 1)]];
|
||||
@@ -8141,17 +8149,6 @@ kernel void kernel_set_rows_f(
|
||||
constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
|
||||
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
|
||||
|
||||
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
|
||||
#define BLOCK_SIZE_K 32
|
||||
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
|
||||
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
|
||||
#define THREAD_PER_BLOCK 128
|
||||
#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers
|
||||
#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers
|
||||
#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8
|
||||
#define SG_MAT_ROW 8
|
||||
|
||||
// each block_q contains 16*nl weights
|
||||
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
kernel void kernel_mul_mm(
|
||||
@@ -8167,18 +8164,48 @@ kernel void kernel_mul_mm(
|
||||
threadgroup S0 * sa = (threadgroup S0 *)(shmem);
|
||||
threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
|
||||
|
||||
const int r0 = tgpig.y;
|
||||
const int r1 = tgpig.x;
|
||||
threadgroup float * sc = (threadgroup float *)(shmem);
|
||||
|
||||
constexpr int NR0 = 64;
|
||||
constexpr int NR1 = 32;
|
||||
|
||||
constexpr int NK = 32;
|
||||
constexpr int NL0 = NK/16;
|
||||
constexpr int NL1 = NK/8;
|
||||
|
||||
const int im = tgpig.z;
|
||||
const int r0 = tgpig.y*NR0;
|
||||
const int r1 = tgpig.x*NR1;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
|
||||
const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
|
||||
const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31
|
||||
|
||||
const short il0 = (tiitg % NL0);
|
||||
|
||||
short il = il0;
|
||||
|
||||
const int i12 = im%args.ne12;
|
||||
const int i13 = im/args.ne12;
|
||||
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const short offset1 = il0/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
|
||||
|
||||
const short iy = 8*(tiitg % NL1);
|
||||
|
||||
device const T1 * y = (device const T1 *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*(r1 + lr1)
|
||||
+ args.nb10*iy);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
S0_8x8 ma[4];
|
||||
S1_8x8 mb[2];
|
||||
|
||||
@@ -8187,36 +8214,36 @@ kernel void kernel_mul_mm(
|
||||
for (short i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
#else
|
||||
auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
|
||||
auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));
|
||||
|
||||
short il = (tiitg % THREAD_PER_ROW);
|
||||
mpp::tensor_ops::matmul2d<
|
||||
mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
|
||||
execution_simdgroups<4>> mm;
|
||||
|
||||
const int i12 = im%args.ne12;
|
||||
const int i13 = im/args.ne12;
|
||||
auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
|
||||
#endif
|
||||
|
||||
const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
|
||||
const short offset1 = il/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0
|
||||
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
|
||||
|
||||
const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL));
|
||||
|
||||
device const T1 * y = (device const T1 *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*(r1*BLOCK_SIZE_N + thread_col)
|
||||
+ args.nb10*iy);
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
|
||||
+ (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0;
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
@@ -8225,91 +8252,203 @@ kernel void kernel_mul_mm(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
|
||||
+ (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
// NOTE: this is massively slower.. WTF?
|
||||
//sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? (S1) ((device T1 *) y)[i] : 0;
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
*(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short dx = sx;
|
||||
const short dy = sy;
|
||||
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#else
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
//const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#endif
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
y += NK;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
|
||||
threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
|
||||
FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
#pragma unroll(4)
|
||||
for (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
|
||||
}
|
||||
|
||||
#pragma unroll(2)
|
||||
for (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
|
||||
FOR_UNROLL (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
#pragma unroll(8)
|
||||
for (short i = 0; i < 8; i++){
|
||||
FOR_UNROLL (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsma += 8*64;
|
||||
lsmb += 4*64;
|
||||
}
|
||||
#else
|
||||
auto sA = tA.slice(0, 0);
|
||||
auto sB = tB.slice(0, 0);
|
||||
|
||||
mm.run(sB, sA, cT);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (!FC_mul_mm_bc_out || ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1)) {
|
||||
if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
|
||||
// if no bounds checks on the output are needed, we can directly write to device memory
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
device float * C = (device float *) dst +
|
||||
(BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
|
||||
(BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
|
||||
r0 + \
|
||||
r1 * args.ne0 + im*args.ne1*args.ne0;
|
||||
|
||||
auto tC = tensor<device float, dextents<int32_t, 2>, tensor_inline>(C, dextents<int32_t, 2>(args.ne0, NR1));
|
||||
cT.store(tC);
|
||||
#else
|
||||
device float * C = (device float *) dst +
|
||||
(r0 + 32*(sgitg & 1)) + \
|
||||
(r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
|
||||
simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) \
|
||||
+ 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
|
||||
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
|
||||
cT.store(tC);
|
||||
#else
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (sgitg == 0) {
|
||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||
device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.ne0 + im*args.ne1*args.ne0;
|
||||
for (int j = tiitg; j < nr1; j += NR1) {
|
||||
device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0;
|
||||
device float4 * D4 = (device float4 *) D;
|
||||
|
||||
threadgroup float * C = temp_str + (j*BLOCK_SIZE_M);
|
||||
threadgroup float * C = temp_str + (j*NR0);
|
||||
threadgroup float4 * C4 = (threadgroup float4 *) C;
|
||||
|
||||
int i = 0;
|
||||
for (; i < n_rows/4; i++) {
|
||||
for (; i < nr0/4; i++) {
|
||||
*(D4 + i) = *(C4 + i);
|
||||
}
|
||||
|
||||
i *= 4;
|
||||
for (; i < n_rows; i++) {
|
||||
for (; i < nr0; i++) {
|
||||
*(D + i) = *(C + i);
|
||||
}
|
||||
}
|
||||
@@ -8394,31 +8533,63 @@ kernel void kernel_mul_mm_id(
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
threadgroup S0 * sa = (threadgroup S0 *)(shmem);
|
||||
threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
|
||||
|
||||
const int r0 = tgpig.y;
|
||||
const int r1 = tgpig.x;
|
||||
threadgroup float * sc = (threadgroup float *)(shmem);
|
||||
|
||||
constexpr int NR0 = 64;
|
||||
constexpr int NR1 = 32;
|
||||
|
||||
constexpr int NK = 32;
|
||||
constexpr int NL0 = NK/16;
|
||||
constexpr int NL1 = NK/8;
|
||||
|
||||
const int im = tgpig.z; // expert
|
||||
const int r0 = tgpig.y*NR0;
|
||||
const int r1 = tgpig.x*NR1;
|
||||
|
||||
device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe);
|
||||
device const int32_t * ids_i32 = (device const int32_t *) (hids);
|
||||
|
||||
const int32_t neh1 = tpe_u32[im];
|
||||
|
||||
if (r1*BLOCK_SIZE_N >= neh1) {
|
||||
if (r1 >= neh1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
|
||||
const short nr1 = ( neh1 - r1 < NR1) ? ( neh1 - r1) : NR1;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
|
||||
const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31
|
||||
|
||||
const short il0 = (tiitg % NL0);
|
||||
|
||||
short il = il0;
|
||||
|
||||
const int id = ids_i32[im*args.ne21 + r1 + lr1];
|
||||
|
||||
const short i11 = (id % args.ne20) % args.ne11;
|
||||
const short i12 = (id / args.ne20);
|
||||
const short i13 = 0;
|
||||
|
||||
const uint64_t offset0 = im*args.nb02 + i13*args.nb03;
|
||||
const short offset1 = il0/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
|
||||
|
||||
const short iy = 8*(tiitg % NL1);
|
||||
|
||||
device const T1 * y = (device const T1 *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*i11
|
||||
+ args.nb10*iy);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
S0_8x8 ma[4];
|
||||
S1_8x8 mb[2];
|
||||
|
||||
@@ -8427,39 +8598,36 @@ kernel void kernel_mul_mm_id(
|
||||
for (short i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
#else
|
||||
auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
|
||||
auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));
|
||||
|
||||
short il = (tiitg % THREAD_PER_ROW);
|
||||
mpp::tensor_ops::matmul2d<
|
||||
mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
|
||||
execution_simdgroups<4>> mm;
|
||||
|
||||
const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + thread_col];
|
||||
auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
|
||||
#endif
|
||||
|
||||
const short i11 = (id % args.ne20) % args.ne11;
|
||||
const short i12 = (id / args.ne20);
|
||||
const short i13 = 0;
|
||||
|
||||
const uint64_t offset0 = im*args.nb02 + i13*args.nb03;
|
||||
const short offset1 = il/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0
|
||||
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
|
||||
|
||||
const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL));
|
||||
|
||||
device const T1 * y = (device const T1 *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*i11
|
||||
+ args.nb10*iy);
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
|
||||
+ (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0;
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
@@ -8468,85 +8636,188 @@ kernel void kernel_mul_mm_id(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
|
||||
+ (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
|
||||
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
// NOTE: this is massively slower.. WTF?
|
||||
//sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? (S1) ((device T1 *) y)[i] : 0;
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
*(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short dx = sx;
|
||||
const short dy = sy;
|
||||
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#else
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
//const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#endif
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
y += NK;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
|
||||
threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
|
||||
#pragma unroll(4)
|
||||
for (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
|
||||
FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
#pragma unroll(2)
|
||||
for (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
|
||||
FOR_UNROLL (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
#pragma unroll(8)
|
||||
for (short i = 0; i < 8; i++){
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
|
||||
lsma += 8*64;
|
||||
lsmb += 4*64;
|
||||
}
|
||||
#else
|
||||
auto sA = tA.slice(0, 0);
|
||||
auto sB = tB.slice(0, 0);
|
||||
|
||||
mm.run(sB, sA, cT);
|
||||
#endif
|
||||
}
|
||||
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) \
|
||||
+ 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
|
||||
cT.store(tC);
|
||||
#else
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (short j = sgitg; j < n_cols; j += 4) {
|
||||
const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + j];
|
||||
for (short j = sgitg; j < nr1; j += 4) {
|
||||
const int id = ids_i32[im*args.ne21 + r1 + j];
|
||||
|
||||
const short ide = id % args.ne20;
|
||||
const short idt = id / args.ne20;
|
||||
|
||||
device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + ide*args.ne0 + idt*args.ne1*args.ne0;
|
||||
device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0;
|
||||
device float4 * D4 = (device float4 *) D;
|
||||
|
||||
threadgroup float * C = (threadgroup float *) shmem + (j*BLOCK_SIZE_M);
|
||||
threadgroup float * C = (threadgroup float *) shmem + j*NR0;
|
||||
threadgroup float4 * C4 = (threadgroup float4 *) C;
|
||||
|
||||
int i = tiisg;
|
||||
for (; i < n_rows/4; i += 32) {
|
||||
for (; i < nr0/4; i += 32) {
|
||||
*(D4 + i) = *(C4 + i);
|
||||
}
|
||||
|
||||
i = (4*(n_rows/4)) + tiisg;
|
||||
for (; i < n_rows; i += 32) {
|
||||
i = (4*(nr0/4)) + tiisg;
|
||||
for (; i < nr0; i += 32) {
|
||||
*(D + i) = *(C + i);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,9 +11,13 @@
|
||||
//
|
||||
|
||||
#include "concat.hpp"
|
||||
#include "common.hpp"
|
||||
|
||||
static void concat_f32_dim0(const float *x, const float *y, float *dst,
|
||||
static inline size_t elem_size(ggml_type t) {
|
||||
return ggml_type_size(t) / ggml_blck_size(t);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void concat_T_dim0(const T *x, const T *y, T *dst,
|
||||
const int ne0, const int ne00,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
@@ -36,7 +40,8 @@ static void concat_f32_dim0(const float *x, const float *y, float *dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_dim1(const float *x, const float *y, float *dst,
|
||||
template <typename T>
|
||||
static void concat_T_dim1(const T *x, const T *y, T *dst,
|
||||
const int ne0, const int ne01,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
@@ -59,7 +64,8 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_dim2(const float *x, const float *y, float *dst,
|
||||
template <typename T>
|
||||
static void concat_T_dim2(const T *x, const T *y, T *dst,
|
||||
const int ne0, const int ne02,
|
||||
const sycl::nd_item<3> &item_ct1) {
|
||||
int nidx = item_ct1.get_local_id(2) +
|
||||
@@ -82,45 +88,35 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_sycl(const float *x, const float *y, float *dst,
|
||||
template <typename T>
|
||||
static void concat_T_sycl(const T *x, const T *y, T *dst,
|
||||
int ne00, int ne01, int ne02, int ne0, int ne1,
|
||||
int ne2, int dim, queue_ptr stream) {
|
||||
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
|
||||
sycl::range<3> gridDim(ne2, ne1, num_blocks);
|
||||
switch (dim) {
|
||||
case 0:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
|
||||
});
|
||||
break;
|
||||
stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { concat_T_dim0<T>(x, y, dst, ne0, ne00, item_ct1); });
|
||||
break;
|
||||
case 1:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
|
||||
});
|
||||
break;
|
||||
stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { concat_T_dim1<T>(x, y, dst, ne0, ne01, item_ct1); });
|
||||
break;
|
||||
// dim >=2 will be dispatched to the default path
|
||||
default:
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(gridDim *
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
|
||||
});
|
||||
break;
|
||||
stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) { concat_T_dim2<T>(x, y, dst, ne0, ne02, item_ct1); });
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// non-contiguous kernel (slow)
|
||||
static void concat_f32_sycl_non_cont(
|
||||
template<typename T>
|
||||
static void concat_T_sycl_non_cont(
|
||||
queue_ptr stream, const char *src0, const char *src1, char *dst,
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
|
||||
uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
|
||||
@@ -137,24 +133,25 @@ static void concat_f32_sycl_non_cont(
|
||||
int64_t o[4] = { 0, 0, 0, 0 };
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
const float * x;
|
||||
const T * x;
|
||||
|
||||
for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
|
||||
x = (const T *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
|
||||
} else {
|
||||
x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
|
||||
x = (const T *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
|
||||
(i0 - o[0]) * nb10);
|
||||
}
|
||||
|
||||
float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
|
||||
T *y = (T *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
template <typename T>
|
||||
void concat_impl_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
@@ -163,15 +160,14 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const T * src0_d = (const T *) src0->data;
|
||||
const T * src1_d = (const T *) src1->data;
|
||||
T * dst_d = (T *) dst->data;
|
||||
size_t type_size = elem_size(dst->type);
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
|
||||
concat_T_sycl<T>(src0_d + i3 * (src0->nb[3] / type_size), src1_d + i3 * (src1->nb[3] / type_size),
|
||||
dst_d + i3 * (dst->nb[3] / type_size), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
|
||||
dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
} else {
|
||||
@@ -179,13 +175,28 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / type_size, src1_d, size1).wait()));
|
||||
}
|
||||
} else {
|
||||
concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||
concat_T_sycl_non_cont<T>(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
|
||||
src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
|
||||
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32:
|
||||
concat_impl_sycl<float>(ctx, dst);
|
||||
break;
|
||||
case GGML_TYPE_I32:
|
||||
concat_impl_sycl<int32_t>(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4534,16 +4534,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
}
|
||||
return false;
|
||||
}
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
||||
}
|
||||
case GGML_OP_REPEAT_BACK:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
return src0_type == GGML_TYPE_F32;
|
||||
}
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_NONE:
|
||||
|
||||
@@ -14104,20 +14104,11 @@ size_t comp_size;
|
||||
size_t comp_nb[GGML_MAX_DIMS];
|
||||
size_t check_counter = 0;
|
||||
static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
|
||||
ggml_tensor * tensor = cgraph->nodes[tensor_idx];
|
||||
ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops];
|
||||
if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool fused_rms_norm_mul = false;
|
||||
int rms_norm_idx = -1;
|
||||
if (ctx->num_additional_fused_ops == 1 &&
|
||||
tensor->op == GGML_OP_RMS_NORM &&
|
||||
cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) {
|
||||
fused_rms_norm_mul = true;
|
||||
tensor = cgraph->nodes[tensor_idx + 1];
|
||||
}
|
||||
|
||||
check_counter++;
|
||||
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
|
||||
return;
|
||||
@@ -14125,9 +14116,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
|
||||
VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")");
|
||||
|
||||
ggml_tensor * src0 = tensor->src[0];
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
/*.mem_size =*/ 2ul*1024ul*1024ul*1024ul,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
@@ -14137,328 +14125,339 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
struct ggml_context * ggml_ctx = ggml_init(iparams);
|
||||
|
||||
std::array<struct ggml_tensor *, GGML_MAX_SRC> src_clone = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||
std::array<size_t, GGML_MAX_SRC> src_size = {};
|
||||
std::array<void *, GGML_MAX_SRC> src_buffer = {};
|
||||
const char * srci_name[GGML_MAX_SRC] = {"src0", "src1", "src2", "src3", "src4", "src5", "src6", "src7", "src8", "src9"};
|
||||
|
||||
std::map<ggml_tensor *, ggml_tensor *> cloned_tensors;
|
||||
std::vector<void *> cloned_mallocs;
|
||||
|
||||
struct ggml_tensor * tensor_clone = nullptr;
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
ggml_tensor * srci = tensor->src[i];
|
||||
if (fused_rms_norm_mul) {
|
||||
rms_norm_idx = tensor->src[0]->op == GGML_OP_RMS_NORM ? 0 : 1;
|
||||
ggml_tensor *rms_norm = tensor->src[rms_norm_idx];
|
||||
switch (i) {
|
||||
case 0: srci = rms_norm->src[0]; break;
|
||||
case 1: srci = tensor->src[1 - rms_norm_idx]; break;
|
||||
default: continue;
|
||||
for (int f = 0; f < ctx->num_additional_fused_ops + 1; ++f) {
|
||||
tensor = cgraph->nodes[tensor_idx + f];
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
ggml_tensor * srci = tensor->src[i];
|
||||
if (srci == nullptr) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (srci == nullptr) {
|
||||
continue;
|
||||
}
|
||||
ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci);
|
||||
size_t srci_size = ggml_nbytes(srci);
|
||||
// If a src tensor has been cloned, use that one
|
||||
auto it = cloned_tensors.find(srci);
|
||||
if (it != cloned_tensors.end()) {
|
||||
src_clone[i] = it->second;
|
||||
continue;
|
||||
}
|
||||
ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci);
|
||||
size_t srci_size = ggml_nbytes(srci);
|
||||
|
||||
src_clone[i] = srci_clone;
|
||||
src_size[i] = ggml_nbytes(srci);
|
||||
src_buffer[i] = malloc(srci_size);
|
||||
src_clone[i] = srci_clone;
|
||||
void *src_buffer = malloc(srci_size);
|
||||
cloned_mallocs.push_back(src_buffer);
|
||||
|
||||
srci_clone->data = src_buffer[i];
|
||||
if (ggml_backend_buffer_is_host(srci->buffer)) {
|
||||
memcpy(srci_clone->data, srci->data, srci_size);
|
||||
memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (ggml_backend_buffer_is_vk(srci->buffer)) {
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context;
|
||||
vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
|
||||
uint64_t offset = vk_tensor_offset(srci) + srci->view_offs;
|
||||
if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) {
|
||||
for (int i3 = 0; i3 < srci->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < srci->ne[2]; i2++) {
|
||||
const int idx = i3*srci->ne[2] + i2;
|
||||
ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
srci_clone->nb[0] = srci->nb[0];
|
||||
srci_clone->nb[1] = srci->nb[1];
|
||||
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
||||
srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
|
||||
}
|
||||
} else {
|
||||
if (offset + srci_size >= buffer_gpu->size) {
|
||||
srci_size = buffer_gpu->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size);
|
||||
srci_clone->data = src_buffer;
|
||||
if (ggml_backend_buffer_is_host(srci->buffer)) {
|
||||
memcpy(srci_clone->data, srci->data, srci_size);
|
||||
memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (ggml_backend_buffer_is_vk(srci->buffer)) {
|
||||
ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context;
|
||||
vk_buffer& buffer_gpu = buf_ctx->dev_buffer;
|
||||
uint64_t offset = vk_tensor_offset(srci) + srci->view_offs;
|
||||
if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) {
|
||||
for (int i3 = 0; i3 < srci->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < srci->ne[2]; i2++) {
|
||||
const int idx = i3*srci->ne[2] + i2;
|
||||
ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
srci_clone->nb[0] = srci->nb[0];
|
||||
srci_clone->nb[1] = srci->nb[1];
|
||||
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
||||
srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1];
|
||||
}
|
||||
} else {
|
||||
if (offset + srci_size >= buffer_gpu->size) {
|
||||
srci_size = buffer_gpu->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size);
|
||||
memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(srci, srci_name[i]);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(srci, srci_name[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]);
|
||||
if (src_clone[4]) {
|
||||
ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_MUL_MAT) {
|
||||
tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_MUL_MAT_ID) {
|
||||
tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
|
||||
} else if (tensor->op == GGML_OP_SUB) {
|
||||
tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_MUL) {
|
||||
if (fused_rms_norm_mul) {
|
||||
tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params);
|
||||
tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]);
|
||||
} else {
|
||||
tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_DIV) {
|
||||
tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_CONCAT) {
|
||||
tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_UPSCALE) {
|
||||
tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
|
||||
} else if (tensor->op == GGML_OP_SCALE) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
|
||||
} else if (tensor->op == GGML_OP_SQR) {
|
||||
tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SQRT) {
|
||||
tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SIN) {
|
||||
tensor_clone = ggml_sin(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_COS) {
|
||||
tensor_clone = ggml_cos(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_CLAMP) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
|
||||
} else if (tensor->op == GGML_OP_PAD) {
|
||||
tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3],
|
||||
tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]);
|
||||
} else if (tensor->op == GGML_OP_REPEAT) {
|
||||
tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
|
||||
} else if (tensor->op == GGML_OP_REPEAT_BACK) {
|
||||
tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor);
|
||||
} else if (tensor->op == GGML_OP_ADD) {
|
||||
tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_ACC) {
|
||||
tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
|
||||
} else if (tensor->op == GGML_OP_NORM) {
|
||||
tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_GROUP_NORM) {
|
||||
const float * float_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
|
||||
} else if (tensor->op == GGML_OP_RMS_NORM) {
|
||||
tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
|
||||
const float eps = ((float *) tensor->op_params)[0];
|
||||
tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
|
||||
} else if (tensor->op == GGML_OP_SILU_BACK) {
|
||||
tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_L2_NORM) {
|
||||
const float eps = ((float *) tensor->op_params)[0];
|
||||
tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
|
||||
} else if (tensor->op == GGML_OP_SOFT_MAX) {
|
||||
if (src1 != nullptr) {
|
||||
if (tensor->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
|
||||
} else {
|
||||
tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
|
||||
tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
|
||||
} else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
|
||||
tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
|
||||
} else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
|
||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||
//const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
|
||||
const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
|
||||
const float freq_base = ((float *) tensor->op_params)[5];
|
||||
const float freq_scale = ((float *) tensor->op_params)[6];
|
||||
const float ext_factor = ((float *) tensor->op_params)[7];
|
||||
const float attn_factor = ((float *) tensor->op_params)[8];
|
||||
const float beta_fast = ((float *) tensor->op_params)[9];
|
||||
const float beta_slow = ((float *) tensor->op_params)[10];
|
||||
if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
int32_t *sections = ((int32_t *) tensor->op_params) + 11;
|
||||
if (tensor->op == GGML_OP_ROPE) {
|
||||
tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
} else {
|
||||
tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]);
|
||||
if (src_clone[4]) {
|
||||
ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]);
|
||||
}
|
||||
} else {
|
||||
if (tensor->op == GGML_OP_ROPE) {
|
||||
tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
} else if (tensor->op == GGML_OP_MUL_MAT) {
|
||||
tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_MUL_MAT_ID) {
|
||||
tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
|
||||
} else if (tensor->op == GGML_OP_SUB) {
|
||||
tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_MUL) {
|
||||
tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_DIV) {
|
||||
tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_CONCAT) {
|
||||
tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_UPSCALE) {
|
||||
tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]);
|
||||
} else if (tensor->op == GGML_OP_SCALE) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]);
|
||||
} else if (tensor->op == GGML_OP_SQR) {
|
||||
tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SQRT) {
|
||||
tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SIN) {
|
||||
tensor_clone = ggml_sin(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_COS) {
|
||||
tensor_clone = ggml_cos(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_CLAMP) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
|
||||
} else if (tensor->op == GGML_OP_PAD) {
|
||||
tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3],
|
||||
tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]);
|
||||
} else if (tensor->op == GGML_OP_REPEAT) {
|
||||
tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor);
|
||||
} else if (tensor->op == GGML_OP_REPEAT_BACK) {
|
||||
tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor);
|
||||
} else if (tensor->op == GGML_OP_ADD) {
|
||||
tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_ACC) {
|
||||
tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
|
||||
} else if (tensor->op == GGML_OP_NORM) {
|
||||
tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_GROUP_NORM) {
|
||||
const float * float_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]);
|
||||
} else if (tensor->op == GGML_OP_RMS_NORM) {
|
||||
tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_RMS_NORM_BACK) {
|
||||
const float eps = ((float *) tensor->op_params)[0];
|
||||
tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps);
|
||||
} else if (tensor->op == GGML_OP_SILU_BACK) {
|
||||
tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_L2_NORM) {
|
||||
const float eps = ((float *) tensor->op_params)[0];
|
||||
tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps);
|
||||
} else if (tensor->op == GGML_OP_SOFT_MAX) {
|
||||
if (tensor->src[1] != nullptr) {
|
||||
const float * params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]);
|
||||
} else {
|
||||
tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_SOFT_MAX_BACK) {
|
||||
tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
|
||||
} else if (tensor->op == GGML_OP_DIAG_MASK_INF) {
|
||||
tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]);
|
||||
} else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) {
|
||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||
//const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3];
|
||||
const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4];
|
||||
const float freq_base = ((float *) tensor->op_params)[5];
|
||||
const float freq_scale = ((float *) tensor->op_params)[6];
|
||||
const float ext_factor = ((float *) tensor->op_params)[7];
|
||||
const float attn_factor = ((float *) tensor->op_params)[8];
|
||||
const float beta_fast = ((float *) tensor->op_params)[9];
|
||||
const float beta_slow = ((float *) tensor->op_params)[10];
|
||||
if (mode & GGML_ROPE_TYPE_MROPE) {
|
||||
int32_t *sections = ((int32_t *) tensor->op_params) + 11;
|
||||
if (tensor->op == GGML_OP_ROPE) {
|
||||
tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
} else {
|
||||
tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
}
|
||||
} else {
|
||||
if (tensor->op == GGML_OP_ROPE) {
|
||||
tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
} else {
|
||||
tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
}
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_UNARY) {
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_EXP:
|
||||
tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
tensor_clone = ggml_silu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU:
|
||||
tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
tensor_clone = ggml_relu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
default:
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_GLU) {
|
||||
if (src_clone[1] == nullptr) {
|
||||
tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
|
||||
} else {
|
||||
tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
|
||||
}
|
||||
ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2));
|
||||
ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3));
|
||||
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
|
||||
if (tensor->src[1] == nullptr) {
|
||||
tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
|
||||
tensor_clone->type = tensor->type;
|
||||
} else {
|
||||
tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_CONT) {
|
||||
tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
} else if (tensor->op == GGML_OP_RESHAPE) {
|
||||
tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
} else if (tensor->op == GGML_OP_VIEW) {
|
||||
tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
|
||||
} else if (tensor->op == GGML_OP_PERMUTE) {
|
||||
int32_t * params = (int32_t *)tensor->op_params;
|
||||
tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]);
|
||||
} else if (tensor->op == GGML_OP_TRANSPOSE) {
|
||||
tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_GET_ROWS) {
|
||||
tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_ARGSORT) {
|
||||
tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_SUM) {
|
||||
tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SUM_ROWS) {
|
||||
tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_MEAN) {
|
||||
tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_ARGMAX) {
|
||||
tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_COUNT_EQUAL) {
|
||||
tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_IM2COL) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
const int32_t p1 = tensor->op_params[3];
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
|
||||
const bool is_2D = tensor->op_params[6] == 1;
|
||||
tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_IM2COL_3D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t s2 = tensor->op_params[2];
|
||||
const int32_t p0 = tensor->op_params[3];
|
||||
const int32_t p1 = tensor->op_params[4];
|
||||
const int32_t p2 = tensor->op_params[5];
|
||||
const int32_t d0 = tensor->op_params[6];
|
||||
const int32_t d1 = tensor->op_params[7];
|
||||
const int32_t d2 = tensor->op_params[8];
|
||||
const int32_t IC = tensor->op_params[9];
|
||||
|
||||
tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
|
||||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
|
||||
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t p0 = tensor->op_params[1];
|
||||
const int32_t d0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
const int32_t k1 = tensor->op_params[2];
|
||||
const int32_t s0 = tensor->op_params[3];
|
||||
const int32_t s1 = tensor->op_params[4];
|
||||
const int32_t p0 = tensor->op_params[5];
|
||||
const int32_t p1 = tensor->op_params[6];
|
||||
|
||||
tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
|
||||
} else if (tensor->op == GGML_OP_CONV_2D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
const int32_t p1 = tensor->op_params[3];
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
|
||||
} else if (tensor->op == GGML_OP_CONV_2D_DW) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
const int32_t p1 = tensor->op_params[3];
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
tensor_clone = ggml_conv_2d_dw_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
|
||||
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
|
||||
const int32_t s = tensor->op_params[0];
|
||||
tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
|
||||
} else if (tensor->op == GGML_OP_RWKV_WKV6) {
|
||||
tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
|
||||
} else if (tensor->op == GGML_OP_RWKV_WKV7) {
|
||||
tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
|
||||
src_clone[4], src_clone[5], src_clone[6]);
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
|
||||
src_clone[0]->flags = tensor->src[0]->flags;
|
||||
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4]);
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
|
||||
src_clone[0]->flags = tensor->src[0]->flags;
|
||||
tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2]);
|
||||
} else if (tensor->op == GGML_OP_ADD_ID) {
|
||||
tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
|
||||
} else if (tensor->op == GGML_OP_SSM_SCAN) {
|
||||
tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2],
|
||||
src_clone[3], src_clone[4], src_clone[5], src_clone[6]);
|
||||
} else if (tensor->op == GGML_OP_SSM_CONV) {
|
||||
tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_ROLL) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t s2 = tensor->op_params[2];
|
||||
const int32_t s3 = tensor->op_params[3];
|
||||
tensor_clone = ggml_roll(ggml_ctx, src_clone[0], s0, s1, s2, s3);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_UNARY) {
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_EXP:
|
||||
tensor_clone = ggml_exp(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_SILU:
|
||||
tensor_clone = ggml_silu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU:
|
||||
tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
tensor_clone = ggml_relu(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_TANH:
|
||||
tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]);
|
||||
break;
|
||||
default:
|
||||
else {
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_GLU) {
|
||||
if (src_clone[1] == nullptr) {
|
||||
tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]);
|
||||
} else {
|
||||
tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]);
|
||||
}
|
||||
ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2));
|
||||
ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3));
|
||||
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
|
||||
if (src1 == nullptr) {
|
||||
tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
|
||||
tensor_clone->type = tensor->type;
|
||||
} else {
|
||||
tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
}
|
||||
} else if (tensor->op == GGML_OP_CONT) {
|
||||
tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
} else if (tensor->op == GGML_OP_RESHAPE) {
|
||||
tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
|
||||
} else if (tensor->op == GGML_OP_VIEW) {
|
||||
tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]);
|
||||
} else if (tensor->op == GGML_OP_PERMUTE) {
|
||||
int32_t * params = (int32_t *)tensor->op_params;
|
||||
tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]);
|
||||
} else if (tensor->op == GGML_OP_TRANSPOSE) {
|
||||
tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_GET_ROWS) {
|
||||
tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_ARGSORT) {
|
||||
tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params);
|
||||
} else if (tensor->op == GGML_OP_SUM) {
|
||||
tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_SUM_ROWS) {
|
||||
tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_MEAN) {
|
||||
tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_ARGMAX) {
|
||||
tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
|
||||
} else if (tensor->op == GGML_OP_COUNT_EQUAL) {
|
||||
tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
} else if (tensor->op == GGML_OP_IM2COL) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
const int32_t p1 = tensor->op_params[3];
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
|
||||
const bool is_2D = tensor->op_params[6] == 1;
|
||||
tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_IM2COL_3D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t s2 = tensor->op_params[2];
|
||||
const int32_t p0 = tensor->op_params[3];
|
||||
const int32_t p1 = tensor->op_params[4];
|
||||
const int32_t p2 = tensor->op_params[5];
|
||||
const int32_t d0 = tensor->op_params[6];
|
||||
const int32_t d1 = tensor->op_params[7];
|
||||
const int32_t d2 = tensor->op_params[8];
|
||||
const int32_t IC = tensor->op_params[9];
|
||||
|
||||
tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
|
||||
} else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
|
||||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
|
||||
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t p0 = tensor->op_params[1];
|
||||
const int32_t d0 = tensor->op_params[2];
|
||||
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
const int32_t k1 = tensor->op_params[2];
|
||||
const int32_t s0 = tensor->op_params[3];
|
||||
const int32_t s1 = tensor->op_params[4];
|
||||
const int32_t p0 = tensor->op_params[5];
|
||||
const int32_t p1 = tensor->op_params[6];
|
||||
|
||||
tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1);
|
||||
} else if (tensor->op == GGML_OP_CONV_2D) {
|
||||
const int32_t s0 = tensor->op_params[0];
|
||||
const int32_t s1 = tensor->op_params[1];
|
||||
const int32_t p0 = tensor->op_params[2];
|
||||
const int32_t p1 = tensor->op_params[3];
|
||||
const int32_t d0 = tensor->op_params[4];
|
||||
const int32_t d1 = tensor->op_params[5];
|
||||
tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1);
|
||||
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) {
|
||||
const int32_t s = tensor->op_params[0];
|
||||
tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s);
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false);
|
||||
} else if (tensor->op == GGML_OP_RWKV_WKV6) {
|
||||
tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4], src_clone[5]);
|
||||
} else if (tensor->op == GGML_OP_RWKV_WKV7) {
|
||||
tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3],
|
||||
src_clone[4], src_clone[5], src_clone[6]);
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) {
|
||||
src_clone[0]->flags = src0->flags;
|
||||
tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2], src_clone[3], src_clone[4]);
|
||||
} else if (tensor->op == GGML_OP_OPT_STEP_SGD) {
|
||||
src_clone[0]->flags = src0->flags;
|
||||
tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1],
|
||||
src_clone[2]);
|
||||
} else if (tensor->op == GGML_OP_ADD_ID) {
|
||||
tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]);
|
||||
} else if (tensor->op == GGML_OP_SSM_SCAN) {
|
||||
tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2],
|
||||
src_clone[3], src_clone[4], src_clone[5], src_clone[6]);
|
||||
} else if (tensor->op == GGML_OP_SSM_CONV) {
|
||||
tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]);
|
||||
}
|
||||
else {
|
||||
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
||||
GGML_ABORT("fatal error");
|
||||
cloned_tensors[tensor] = tensor_clone;
|
||||
}
|
||||
|
||||
ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx);
|
||||
@@ -14476,10 +14475,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
memcpy(comp_result, tensor_clone->data, comp_size);
|
||||
memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (src_buffer[i] != nullptr) {
|
||||
free(src_buffer[i]);
|
||||
}
|
||||
for (auto m : cloned_mallocs) {
|
||||
free(m);
|
||||
}
|
||||
|
||||
ggml_free(ggml_ctx);
|
||||
@@ -14488,15 +14485,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
|
||||
}
|
||||
|
||||
static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) {
|
||||
ggml_tensor * tensor = cgraph->nodes[tensor_idx];
|
||||
ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops];
|
||||
if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) {
|
||||
return;
|
||||
}
|
||||
if (ctx->num_additional_fused_ops == 1 &&
|
||||
tensor->op == GGML_OP_RMS_NORM &&
|
||||
cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) {
|
||||
tensor = cgraph->nodes[tensor_idx + 1];
|
||||
}
|
||||
|
||||
if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) {
|
||||
return;
|
||||
|
||||
@@ -624,14 +624,16 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    ctx->size = 0;
    for (size_t i = 0; i < ctx->info.size(); ++i) {
        const gguf_tensor_info & ti = ctx->info[i];
        if (ti.offset != ctx->size) {
        // alignment offset only exists for GGUF converted with reflinks
        const size_t align_offset = ti.offset % ctx->alignment;
        if (ti.offset - align_offset != ctx->size) {
            GGML_LOG_ERROR("%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
                __func__, ti.t.name, ti.offset, ctx->size);
                __func__, ti.t.name, ti.offset, ctx->size + align_offset);
            GGML_LOG_ERROR("%s: failed to read tensor data\n", __func__);
            gguf_free(ctx);
            return nullptr;
        }
        size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
        size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t) + align_offset, ctx->alignment);
        if (SIZE_MAX - ctx->size < padded_size) {
            GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n",
                __func__, ti.t.name, ctx->size, padded_size);

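As an illustration (not part of the patch), the align_offset / padded_size arithmetic in the hunk above works out as follows; every number here is hypothetical:

    # Hypothetical numbers, only to illustrate the align_offset / padded_size arithmetic above.
    alignment    = 4096          # filesystem block size stored in general.alignment
    tensor_bytes = 5000          # ggml_nbytes(&ti.t)
    offset       = 12288 + 8     # tensor offset written with an 8-byte alignment offset

    align_offset = offset % alignment      # 8; only non-zero for GGUF written with reflinks
    assert offset - align_offset == 12288  # must match the accumulated ctx->size
    # GGML_PAD(tensor_bytes + align_offset, alignment)
    padded_size = ((tensor_bytes + align_offset + alignment - 1) // alignment) * alignment
    print(align_offset, padded_size)       # 8 8192
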
@@ -29,6 +29,7 @@ from .constants import (
|
||||
ExpertGatingFuncType,
|
||||
)
|
||||
|
||||
from .lazy import best_extra_offset, count_reflinkable_size
|
||||
from .quants import quant_shape_from_byte_shape
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -84,14 +85,16 @@ class GGUFWriter:
|
||||
|
||||
def __init__(
|
||||
self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False,
|
||||
use_reflinks = False, # opportunistically attempt to use copy-on-write
|
||||
):
|
||||
self.fout = None
|
||||
self.path = Path(path) if path else None
|
||||
self.arch = arch
|
||||
self.endianess = endianess
|
||||
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||
self.use_temp_file = use_temp_file
|
||||
self.use_reflinks = use_reflinks
|
||||
self.use_temp_file = False if self.use_reflinks else use_temp_file
|
||||
self.temp_file = None
|
||||
self.tensors = [{}]
|
||||
self.kv_data = [{}]
|
||||
@@ -178,13 +181,28 @@ class GGUFWriter:
        self.fout = [open(filename, "wb") for filename in filenames]
        self.state = WriterState.EMPTY

        if self.use_reflinks:
            # reflinks require alignment to the filesystem blocks
            block_size = os.stat(self.path.parent).st_blksize
            # necessary to get an appropriate data start offset when padding for reflinks;
            # using the real alignment (8 bytes, from safetensors) would result in an unusable base data offset
            self.data_alignment = block_size
            # for all shards to allow reading them on their own
            for i, kv in enumerate(self.kv_data):
                # insert at the start of the key-values
                if Keys.General.ALIGNMENT in kv:
                    del kv[Keys.General.ALIGNMENT]
                self.kv_data[i] = {Keys.General.ALIGNMENT: GGUFValue(block_size, GGUFValueType.UINT32), **kv}

def print_plan(self) -> list[Path]:
|
||||
logger.info("Writing the following files:")
|
||||
assert self.path is not None
|
||||
filenames = self.format_shard_names(self.path)
|
||||
assert len(filenames) == len(self.tensors)
|
||||
for name, tensors in zip(filenames, self.tensors):
|
||||
logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
|
||||
total_size = sum(ti.nbytes for ti in tensors.values())
|
||||
reflinkable_size = count_reflinkable_size((name, ti.tensor) for name, ti in tensors.items()) if self.use_reflinks else 0
|
||||
logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(total_size)}{', reflinked = ' + GGUFWriter.format_n_bytes_to_str(total_size - reflinkable_size) if self.use_reflinks else ''}")
|
||||
|
||||
if self.dry_run:
|
||||
logger.info("Dry run, not writing files")
|
||||
@@ -257,14 +275,18 @@ class GGUFWriter:
|
||||
offset_tensor = 0
|
||||
|
||||
for name, ti in tensors.items():
|
||||
extra_offset = 0
|
||||
if self.use_reflinks:
|
||||
extra_offset = best_extra_offset(ti.tensor, offset_tensor)
|
||||
|
||||
ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
|
||||
n_dims = len(ti.shape)
|
||||
ti_data += self._pack("I", n_dims)
|
||||
for j in range(n_dims):
|
||||
ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
|
||||
ti_data += self._pack("I", ti.dtype)
|
||||
ti_data += self._pack("Q", offset_tensor)
|
||||
offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
|
||||
ti_data += self._pack("Q", offset_tensor + extra_offset)
|
||||
offset_tensor += GGUFWriter.ggml_pad(ti.nbytes + extra_offset, self.data_alignment)
|
||||
|
||||
fout.write(ti_data)
|
||||
fout.flush()
|
||||
@@ -392,7 +414,7 @@ class GGUFWriter:
|
||||
def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
|
||||
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
|
||||
if pad != 0:
|
||||
fp.write(bytes([0] * pad))
|
||||
fp.write(b"\x00" * pad)
|
||||
|
||||
def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
|
||||
if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
|
||||
@@ -418,7 +440,7 @@ class GGUFWriter:
|
||||
|
||||
self.write_padding(fout, fout.tell())
|
||||
tensor.tofile(fout)
|
||||
self.write_padding(fout, tensor.nbytes)
|
||||
self.write_padding(fout, fout.tell())
|
||||
|
||||
self.state = WriterState.WEIGHTS
|
||||
|
||||
@@ -458,7 +480,7 @@ class GGUFWriter:
|
||||
shard_bar.update(ti.nbytes)
|
||||
if bar is not None:
|
||||
bar.update(ti.nbytes)
|
||||
self.write_padding(fout, ti.nbytes)
|
||||
self.write_padding(fout, fout.tell())
|
||||
ti.tensor = None
|
||||
else:
|
||||
self.temp_file.seek(0)
|
||||
|
||||
@@ -1,12 +1,19 @@
|
||||
from __future__ import annotations
|
||||
from abc import ABC, ABCMeta, abstractmethod
|
||||
|
||||
import logging
|
||||
from typing import Any, Callable
|
||||
from io import BufferedReader, BufferedWriter
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Iterable
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
from .utility import LocalTensorRange
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -20,10 +27,11 @@ class LazyMeta(ABCMeta):
|
||||
return type(self)._wrap_fn(
|
||||
(lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
|
||||
use_self=self,
|
||||
data_noop=name in ("view", "reshape", "squeeze", "unsqueeze", "contiguous"),
|
||||
)
|
||||
elif isinstance(meta_attr, self._tensor_type):
|
||||
# e.g. self.T with torch.Tensor should still be wrapped
|
||||
return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
|
||||
return type(self)._wrap_fn(lambda s: getattr(s, name), use_self=self)()
|
||||
else:
|
||||
# no need to wrap non-tensor properties,
|
||||
# and they likely don't depend on the actual contents of the tensor
|
||||
@@ -39,8 +47,9 @@ class LazyMeta(ABCMeta):
|
||||
def wrapped_special_op(self, *args, **kwargs):
|
||||
return type(self)._wrap_fn(
|
||||
getattr(type(self)._tensor_type, op_name),
|
||||
use_self=self,
|
||||
meta_noop=meta_noop,
|
||||
)(self, *args, **kwargs)
|
||||
)(*args, **kwargs)
|
||||
return wrapped_special_op
|
||||
|
||||
# special methods bypass __getattr__, so they need to be added manually
|
||||
@@ -76,14 +85,16 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
_args: tuple
|
||||
_kwargs: dict[str, Any]
|
||||
_func: Callable[[Any], Any] | None
|
||||
_ranges: tuple[LocalTensorRange, ...]
|
||||
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None, ranges: tuple[LocalTensorRange, ...] = ()):
|
||||
super().__init__()
|
||||
self._meta = meta
|
||||
self._data = data
|
||||
self._args = args
|
||||
self._kwargs = kwargs if kwargs is not None else {}
|
||||
self._func = func
|
||||
self._ranges = ranges
|
||||
assert self._func is not None or self._data is not None
|
||||
|
||||
def __init_subclass__(cls) -> None:
|
||||
@@ -107,7 +118,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
return o
|
||||
|
||||
@classmethod
|
||||
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
|
||||
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False, data_noop: bool = False) -> Callable[[Any], Any]:
|
||||
def wrapped_fn(*args, **kwargs):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
@@ -116,6 +127,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
|
||||
# TODO: maybe handle tensors in kwargs too
|
||||
|
||||
ranges = use_self._ranges if use_self is not None and data_noop else ()
|
||||
|
||||
if isinstance(meta_noop, bool) and not meta_noop:
|
||||
try:
|
||||
res = fn(*meta_args, **kwargs)
|
||||
@@ -138,7 +151,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
|
||||
|
||||
if isinstance(res, cls._tensor_type):
|
||||
return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
|
||||
return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn, ranges=ranges)
|
||||
elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res):
|
||||
# share the evaluation between lazy tuple elements
|
||||
shared_args: list = [args, None]
|
||||
@@ -202,6 +215,7 @@ class LazyNumpyTensor(LazyBase):
|
||||
_tensor_type = np.ndarray
|
||||
|
||||
shape: tuple[int, ...] # Makes the type checker happy in quants.py
|
||||
nbytes: int
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
||||
@@ -214,10 +228,154 @@ class LazyNumpyTensor(LazyBase):
|
||||
def astype(self, dtype, *args, **kwargs):
|
||||
meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
|
||||
full_args = (self, dtype,) + args
|
||||
return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
|
||||
ranges = self._ranges if self._meta.dtype == dtype else ()
|
||||
return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)), ranges=ranges)
|
||||
|
||||
def tofile(self, *args, **kwargs):
|
||||
eager = LazyNumpyTensor.to_eager(self)
|
||||
return eager.tofile(*args, **kwargs)
|
||||
def tofile(self, fid, *args, **kwargs):
|
||||
if isinstance(fid, BufferedWriter) and len(self._ranges) > 0:
|
||||
return copy_tensor_ranges(self, fid)
|
||||
else:
|
||||
eager = LazyNumpyTensor.to_eager(self)
|
||||
return eager.tofile(fid, *args, **kwargs)
|
||||
|
||||
# TODO: __array_function__
|
||||
|
||||
|
||||
# For aligning blocks when reflinking
|
||||
def best_extra_offset(t: np.ndarray | LazyNumpyTensor | None, current_offset: int) -> int:
|
||||
if not isinstance(t, LazyNumpyTensor):
|
||||
# no file ranges, no need for an offset
|
||||
return 0
|
||||
|
||||
ranges = t._ranges
|
||||
|
||||
histogram: dict[int, int] = {}
|
||||
|
||||
max_block_size = 0
|
||||
for r in ranges:
|
||||
# Ensure minimal alignment is 8 bytes (common with safetensors)
|
||||
# and that the block size is valid
|
||||
if r.offset % 8 == 0 and r.block_size > 0:
|
||||
align_offset = r.offset % r.block_size
|
||||
if align_offset not in histogram:
|
||||
histogram[align_offset] = 0
|
||||
histogram[align_offset] += r.size
|
||||
if r.block_size > max_block_size:
|
||||
max_block_size = r.block_size
|
||||
|
||||
best_offset = 0
|
||||
best_size = 0
|
||||
for offset, size in histogram.items():
|
||||
if size > best_size:
|
||||
best_size = size
|
||||
best_offset = offset
|
||||
|
||||
if max_block_size > 0:
|
||||
# the offset needs to be aligned properly
|
||||
# or else there's probably a block size mismatch
|
||||
assert current_offset % max_block_size == 0, current_offset % max_block_size
|
||||
|
||||
return best_offset
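For illustration only (not part of the patch), this is how the histogram in best_extra_offset would pick the extra offset for a hypothetical set of ranges:

    # Hypothetical ranges, expressed as (offset % block_size, size):
    # most of the data starts 8 bytes into a block, so 8 wins the histogram.
    ranges = [(8, 1_000_000), (8, 500_000), (2048, 10_000)]

    histogram: dict[int, int] = {}
    for align_offset, size in ranges:
        histogram[align_offset] = histogram.get(align_offset, 0) + size

    best_offset = max(histogram, key=lambda off: histogram[off])
    print(best_offset)  # 8
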
|
||||
|
||||
|
||||
def count_reflinkable_size(tensors: Iterable[tuple[str, np.ndarray | LazyNumpyTensor | None]]) -> int:
|
||||
if not hasattr(os, "copy_file_range"):
|
||||
return 0
|
||||
size = 0
|
||||
for name, t in tensors:
|
||||
if isinstance(t, LazyNumpyTensor) and len(t._ranges) > 0:
|
||||
align_offset = best_extra_offset(t, 0)
|
||||
misaligned = 0
|
||||
for range in t._ranges:
|
||||
if range.block_size > 0:
|
||||
if range.offset % range.block_size == align_offset:
|
||||
size += range.size
|
||||
else:
|
||||
misaligned += 1
|
||||
if misaligned > 0:
|
||||
logger.debug(f"{name} misaligned for reflinking, fallback to copy for {misaligned} of {len(t._ranges)} parts")
|
||||
return size
|
||||
|
||||
|
||||
# Copy tensor ranges using os.copy_file_range with aligned offsets and sizes
|
||||
# to make it more likely that copy-on-write is used where possible.
|
||||
# Block alignment is necessary for BTRFS and XFS (and likely for ZFS too).
|
||||
#
|
||||
# Falls back to shutil.copyfileobj when os.copy_file_range is not present.
|
||||
def copy_tensor_ranges(t: LazyNumpyTensor, fout: BufferedWriter):
|
||||
ranges = t._ranges
|
||||
assert len(ranges) > 0
|
||||
dst_offset = fout.tell()
|
||||
extra_offset = best_extra_offset(t, dst_offset)
|
||||
|
||||
if extra_offset > 0:
|
||||
# initial padding
|
||||
fout.write(b"\x00" * extra_offset)
|
||||
|
||||
dst_offset += extra_offset
|
||||
start_offset = dst_offset
|
||||
|
||||
src_files: dict[Path, BufferedReader] = {}
|
||||
for r in ranges:
|
||||
if r.filename not in src_files:
|
||||
src_files[r.filename] = open(r.filename, "rb")
|
||||
|
||||
has_copy_file_range = hasattr(os, "copy_file_range")
|
||||
|
||||
for r in ranges:
|
||||
src = src_files[r.filename]
|
||||
if has_copy_file_range:
|
||||
if r.block_size > 0 and (r.offset % r.block_size) == (start_offset % r.block_size):
|
||||
# Attempting to align copies for reflinking
|
||||
|
||||
# Block 0, 1, 2, 3, 4,
|
||||
# |___0000|0000000|0001111|1111111|111____|
|
||||
#
|
||||
# 1. block 0 is partially overwritten with contents from range[0]
|
||||
# 2. blocks 1 and 2 are copied from range[0] using os.copy_file_range
|
||||
# 3. block 2 is partially overwritten with contents from range[1]
|
||||
# 4. blocks 3 and 4 are copied from range[1] using os.copy_file_range
|
||||
# (repeated for further ranges)
|
||||
if dst_offset % r.block_size == 0:
|
||||
extra_size = 0
|
||||
else:
|
||||
extra_size = r.block_size - (dst_offset % r.block_size)
|
||||
extra_size = min(extra_size, r.size)
|
||||
src.seek(r.offset)
|
||||
buf = src.read(extra_size)
|
||||
fout.seek(dst_offset)
|
||||
fout.write(buf)
|
||||
dst_offset += extra_size
|
||||
if extra_size == r.size:
|
||||
continue
|
||||
|
||||
assert dst_offset % r.block_size == 0, dst_offset % r.block_size
|
||||
|
||||
offset_src = r.offset + extra_size
|
||||
offset_src_end = r.offset + r.size
|
||||
if offset_src_end % r.block_size != 0:
|
||||
offset_src_end += r.block_size - (offset_src_end % r.block_size)
|
||||
size = offset_src_end - offset_src
|
||||
os.copy_file_range(src.fileno(), fout.fileno(), size, offset_src, dst_offset)
|
||||
dst_offset += r.size - extra_size
|
||||
else:
|
||||
# not trying to use reflinks, but still using os.copy_file_range for speed
|
||||
try:
|
||||
os.copy_file_range(src.fileno(), fout.fileno(), r.size, r.offset, dst_offset)
|
||||
except OSError:
|
||||
# fallback when there's a problem (e.g. cross-filesystem copies)
|
||||
src.seek(r.offset)
|
||||
fout.seek(dst_offset)
|
||||
shutil.copyfileobj(src, fout, r.size)
|
||||
dst_offset += r.size
|
||||
else:
|
||||
# not using reflinks, fallback when os.copy_file_range is not supported
|
||||
src.seek(r.offset)
|
||||
fout.seek(dst_offset)
|
||||
shutil.copyfileobj(src, fout, r.size)
|
||||
dst_offset += r.size
|
||||
|
||||
for f in src_files.values():
|
||||
f.close()
|
||||
|
||||
fout.seek(dst_offset)
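A simplified, self-contained sketch (not part of the patch) of the head-then-aligned-copy pattern used by copy_tensor_ranges above; the helper name, paths and sizes are hypothetical, and it assumes the source and destination offsets share the same block alignment, which is what best_extra_offset arranges:

    import os

    # Sketch of the aligned copy (Linux, os.copy_file_range).
    # Assumes offset % block_size == dst_offset % block_size.
    def copy_range_aligned(src_path: str, dst_path: str, offset: int, size: int,
                           dst_offset: int, block_size: int) -> None:
        with open(src_path, "rb") as src, open(dst_path, "r+b") as dst:
            # 1. copy the unaligned head with a plain read/write
            head = min((-dst_offset) % block_size, size)
            src.seek(offset)
            dst.seek(dst_offset)
            dst.write(src.read(head))
            if head == size:
                return
            # 2. the rest starts on a block boundary, so the kernel may reflink it
            os.copy_file_range(src.fileno(), dst.fileno(),
                               size - head, offset + head, dst_offset + head)
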
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fill_templated_filename(filename: str, output_type: str | None) -> str:
|
||||
@@ -177,6 +182,10 @@ class SafetensorRemote:
|
||||
except KeyError as e:
|
||||
raise ValueError(f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}")
|
||||
|
||||
# order by name (same as default safetensors behavior)
|
||||
# ref: https://github.com/huggingface/safetensors/blob/0816a1ae1d6b731cefd67f061d80d1cadd0dd7bb/bindings/python/src/lib.rs#L606
|
||||
res = dict(sorted(res.items(), key=lambda t: t[0]))
|
||||
|
||||
return res
|
||||
|
||||
@classmethod
|
||||
@@ -266,3 +275,82 @@ class SafetensorRemote:
|
||||
if os.environ.get("HF_TOKEN"):
|
||||
headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
|
||||
return headers
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalTensorRange:
|
||||
filename: Path
|
||||
block_size: int
|
||||
offset: int
|
||||
size: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalTensor:
|
||||
dtype: str
|
||||
shape: tuple[int, ...]
|
||||
data_range: LocalTensorRange
|
||||
|
||||
def mmap_bytes(self) -> np.ndarray:
|
||||
return np.memmap(self.data_range.filename, offset=self.data_range.offset, shape=self.data_range.size)
|
||||
|
||||
|
||||
class SafetensorsLocal:
|
||||
"""
|
||||
Read a safetensors file from the local filesystem.
|
||||
|
||||
Custom parsing gives a bit more control over the memory usage.
|
||||
The official safetensors library doesn't expose file ranges.
|
||||
"""
|
||||
ALIGNMENT = 8 # bytes
|
||||
|
||||
tensors: dict[str, LocalTensor]
|
||||
|
||||
def __init__(self, filename: Path, *, reflink: bool = False):
|
||||
stat = os.stat(filename)
|
||||
# using the preferred block size to signal whether reflinks are desired when copying
|
||||
block_size = stat.st_blksize if reflink else -1
|
||||
with open(filename, "rb") as f:
|
||||
metadata_length = int.from_bytes(f.read(8), byteorder='little')
|
||||
file_size = stat.st_size
|
||||
if file_size < 8 + metadata_length:
|
||||
raise ValueError(f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {file_size}")
|
||||
|
||||
metadata_str = f.read(metadata_length).decode('utf-8')
|
||||
try:
|
||||
metadata = json.loads(metadata_str)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Failed to parse safetensors metadata as JSON: {e}")
|
||||
|
||||
data_start_offset = f.tell()
|
||||
alignment = self.ALIGNMENT
|
||||
if data_start_offset % alignment != 0:
|
||||
data_start_offset += alignment - (data_start_offset % alignment)
|
||||
|
||||
tensors: dict[str, LocalTensor] = {}
|
||||
for name, meta in metadata.items():
|
||||
if name == "__metadata__":
|
||||
# ignore metadata, it's not a tensor
|
||||
continue
|
||||
|
||||
tensors[name] = LocalTensor(
|
||||
dtype=meta["dtype"],
|
||||
shape=tuple(meta["shape"]),
|
||||
data_range=LocalTensorRange(
|
||||
filename=filename,
|
||||
block_size=block_size,
|
||||
offset=data_start_offset + meta["data_offsets"][0],
|
||||
size=meta["data_offsets"][1] - meta["data_offsets"][0],
|
||||
),
|
||||
)
|
||||
|
||||
# order by name (same as default safetensors behavior)
|
||||
# ref: https://github.com/huggingface/safetensors/blob/0816a1ae1d6b731cefd67f061d80d1cadd0dd7bb/bindings/python/src/lib.rs#L606
|
||||
self.tensors = dict(sorted(tensors.items(), key=lambda t: t[0]))
|
||||
|
||||
def __enter__(self, *args, **kwargs):
|
||||
del args, kwargs # unused
|
||||
return self.tensors
|
||||
|
||||
def __exit__(self, *args, **kwargs):
|
||||
del args, kwargs # unused
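A hypothetical usage sketch (not part of the patch) of the SafetensorsLocal reader added above; the file name is made up:

    from pathlib import Path

    # "model-00001-of-00002.safetensors" is a made-up file name.
    with SafetensorsLocal(Path("model-00001-of-00002.safetensors"), reflink=True) as tensors:
        for name, t in tensors.items():
            r = t.data_range
            print(f"{name}: dtype={t.dtype}, shape={t.shape}, "
                  f"{r.size} bytes at offset {r.offset} (block_size={r.block_size})")
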
|
||||
|
||||
@@ -21,6 +21,8 @@ llama_context::llama_context(
|
||||
llama_context_params params) :
|
||||
model(model),
|
||||
balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
|
||||
// TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
|
||||
// may need to be backend-dependent
|
||||
LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
|
||||
|
||||
t_start_us = model.t_start_us;
|
||||
|
||||
@@ -2576,9 +2576,10 @@ struct test_cpy : public test_case {
    const std::array<int64_t, 4> permute_dst;
    bool _src_use_permute;
    bool _dst_use_permute;
    bool _src_transpose;

    std::string vars() override {
        return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst);
        return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose);
    }

    double max_nmse_err() override {

@@ -2616,10 +2617,12 @@ struct test_cpy : public test_case {
    test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 1},
            std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
            std::array<int64_t, 4> permute_dst = {0, 0, 0, 0})
            std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
            bool transpose_src = false)
        : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst),
          _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
          _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {}
          _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
          _src_transpose(transpose_src) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());

@@ -2631,6 +2634,11 @@ struct test_cpy : public test_case {
            ggml_set_name(src, "src_permuted");
        }

        if (_src_transpose) {
            src = ggml_transpose(ctx, src);
            ggml_set_name(src, "src_transposed");
        }

        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
        ggml_set_name(dst, "dst");

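The new transpose_src flag makes the copy source a transposed, and therefore non-contiguous, view before the copy. As a rough analogy only (numpy, not ggml), this is the layout situation the added test_cpy cases exercise:

import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 4)
xt = x.T                              # transposed view, no data is moved
assert not xt.flags["C_CONTIGUOUS"]   # strides no longer describe a packed row-major buffer
y = np.ascontiguousarray(xt)          # the "cpy": materialize the view into contiguous memory
assert y.shape == (4, 3)
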
@@ -3377,11 +3385,11 @@ struct test_mul_mat : public test_case {
    const std::array<int64_t, 2> bs; // dims 3 and 4
    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
    const std::array<int64_t, 4> per; // permutation of dimensions
    const bool v; // whether a and b are non-contiguous views
    const int64_t k_v; // size of k in memory, resulting in a non-contiguous view for k_v > k, no view for k_v == 0
    const uint32_t o; // number of outputs

    std::string vars() override {
        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, k_v, o);
    }

    double max_nmse_err() override {

@@ -3402,8 +3410,8 @@ struct test_mul_mat : public test_case {
            std::array<int64_t, 2> bs = {10, 10},
            std::array<int64_t, 2> nr = {2, 2},
            std::array<int64_t, 4> per = {0, 1, 2, 3},
            bool v = false, uint32_t o = 1)
        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
            int64_t k_v = 0, uint32_t o = 1)
        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), k_v(k_v), o(o) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)

@@ -3413,7 +3421,7 @@ struct test_mul_mat : public test_case {
        const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
        if (npermuted > 0) {
            GGML_ASSERT(npermuted == 2);
            GGML_ASSERT(!v); // not handled
            GGML_ASSERT(k_v == 0); // not handled
            GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
            GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);

@@ -3437,29 +3445,21 @@ struct test_mul_mat : public test_case {
            ggml_set_name(a, "a_permuted");
            ggml_set_name(b, "b_permuted");
        } else {
            if (v) {
                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
                b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]);
            const int64_t k_physical = k_v == 0 ? k : k_v;
            a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
            b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);

                if (!ggml_is_quantized(type_a)) {
                    if (bs[1] == 1 && nr[1] == 1) {
                        ggml_set_param(a);
                    }
                    ggml_set_param(b);
            if (!ggml_is_quantized(type_a)) {
                if (bs[1] == 1 && nr[1] == 1) {
                    ggml_set_param(a);
                }
                ggml_set_param(b);
            }

            if (k_v != 0) {
                GGML_ASSERT(k_v > k);
                a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
                b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0);
            } else {
                a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
                b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);

                if (!ggml_is_quantized(type_a)) {
                    if (bs[1] == 1 && nr[1] == 1) {
                        ggml_set_param(a);
                    }
                    ggml_set_param(b);
                }
            }
            ggml_set_name(a, "a");
            ggml_set_name(b, "b");

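The k_v parameter generalizes the old boolean v: when k_v > k, the k elements actually used in each row sit inside a longer physical row of k_v elements, so a and b become non-contiguous views (created with ggml_view_4d above). A small numpy analogy of that memory layout, for illustration only:

import numpy as np

k, k_v, m = 32, 64, 16
a_phys = np.zeros((m, k_v), dtype=np.float32)  # physical storage: rows of k_v elements
a = a_phys[:, :k]                              # logical operand: only the first k columns of each row
assert not a.flags["C_CONTIGUOUS"]             # the row stride stays k_v floats, not k
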
@@ -6641,6 +6641,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));

    test_cases.emplace_back(new test_cont());
    test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));

@@ -6886,7 +6893,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1}));

#if 0

@@ -6912,7 +6919,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    for (uint32_t k = 0; k < 2; ++k) {
        for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3}));
            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, true));
            test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, 2*1056 + k));
        }
    }
}

@@ -7385,6 +7392,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1}));

    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));

    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));

    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));

@@ -7405,7 +7424,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));

    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));

    for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
        for (ggml_type type_a : all_types) {

@@ -1083,16 +1083,24 @@ struct clip_graph {
    }

    ggml_cgraph * build_minicpmv() {
        const int batch_size = 1;

        GGML_ASSERT(model.class_embedding == nullptr);
        const int n_pos = n_patches;
        const int n_pos = n_patches;
        const int n_embd_proj = clip_n_mmproj_embd(ctx);

        // position embeddings for the projector (not for ViT)
        int n_output_dim = clip_n_mmproj_embd(ctx);
        ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size);
        ggml_set_name(pos_embed, "pos_embed");
        ggml_set_input(pos_embed);
        // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
        // base frequency omega
        ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4);
        ggml_set_name(omega, "omega");
        ggml_set_input(omega);

        // 2D input positions (using float for sinusoidal embeddings)
        ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
        ggml_set_name(pos_h, "pos_h");
        ggml_set_input(pos_h);
        ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos);
        ggml_set_name(pos_w, "pos_w");
        ggml_set_input(pos_w);

        // for selecting learned pos embd, used by ViT
        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);

@@ -1103,7 +1111,7 @@ struct clip_graph {

        ggml_tensor * inp = build_inp();
        ggml_tensor * embeddings = build_vit(
                                inp, n_patches,
                                inp, n_pos,
                                NORM_TYPE_NORMAL,
                                hparams.ffn_op,
                                learned_pos_embd,

@@ -1115,17 +1123,39 @@ struct clip_graph {
        ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);

        // norm
        q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
        q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
        v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);

        // calculate sinusoidal pos embd
        ggml_tensor * pos_embed = nullptr;
        {
            // outer product
            ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
            ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
            ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
            // sin and cos
            ggml_tensor * pos_embd_x = ggml_concat(
                ctx0,
                ggml_sin(ctx0, theta_x),
                ggml_cos(ctx0, theta_x),
                0 // concat on first dim
            );
            ggml_tensor * pos_embd_y = ggml_concat(
                ctx0,
                ggml_sin(ctx0, theta_y),
                ggml_cos(ctx0, theta_y),
                0 // concat on first dim
            );
            pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
        }

        // k = v + pos_embed
        ggml_tensor * k = ggml_add(ctx0, v, pos_embed);

        // attention
        {
            int n_embd = clip_n_mmproj_embd(ctx);
            const int d_head = 128;
            int n_head = n_embd/d_head;
            int n_head = n_embd_proj/d_head;
            // Use actual config value if available, otherwise fall back to hardcoded values
            int num_query = ctx->model.hparams.minicpmv_query_num;
            ggml_tensor * Q = ggml_add(ctx0,

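For readers less familiar with the ggml ops, the sinusoidal position embedding assembled in the build_minicpmv() hunk above corresponds roughly to the following numpy sketch (illustration only; pos_w and pos_h are the per-patch x/y coordinates fed in as inputs, and n_embd_proj is the projector width):

import numpy as np

def sincos_pos_embed(pos_w, pos_h, n_embd_proj, base_freq=10000.0):
    d = n_embd_proj // 4
    omega = 1.0 / base_freq ** (np.arange(d, dtype=np.float32) / d)
    theta_x = np.outer(pos_w, omega)  # analogous to ggml_mul(omega_b, pos_w)
    theta_y = np.outer(pos_h, omega)  # analogous to ggml_mul(omega_b, pos_h)
    emb_x = np.concatenate([np.sin(theta_x), np.cos(theta_x)], axis=-1)
    emb_y = np.concatenate([np.sin(theta_y), np.cos(theta_y)], axis=-1)
    return np.concatenate([emb_x, emb_y], axis=-1)  # (n_pos, n_embd_proj)

Moving this computation into the graph, instead of precomputing it on the CPU, is what allows the helper functions removed further below to be dropped.
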
@@ -2761,6 +2791,7 @@ struct clip_model_loader {
            {
                // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                // TODO: verify the image_min_tokens
                hparams.n_merge = 1; // the original pixtral does not use patch merging
                hparams.rope_theta = 10000.0f;
                get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                hparams.set_limit_image_tokens(8, 1024);

@@ -2790,14 +2821,8 @@ struct clip_model_loader {
                get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
                // but we set a lower value to avoid OOM
                // TODO: make it configurable by user
                // TODO (2): bbox coordinates become inaccurate with small number of tokens,
                // therefore we need to increase the min_tokens
                // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
                hparams.set_limit_image_tokens(8, 2048);
                hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
                hparams.set_limit_image_tokens(8, 4096);
                hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                if (hparams.image_min_pixels < warn_min_pixels) {
                    LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);

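As a quick check of the arithmetic in the comment above (numbers taken directly from the comment: total pixel budget, 14x14 patches, 2x2 spatial merge, and a further factor of 4):

# 12845056 / 14 / 14 / 2 / 2 / 4 == 4096 image tokens
assert 12845056 // (14 * 14) // (2 * 2) // 4 == 4096
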
@@ -4569,92 +4594,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
    return n_patches;
}

static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
    assert(embed_dim % 2 == 0);
    int H = pos.size();
    int W = pos[0].size();

    std::vector<float> omega(embed_dim / 2);
    for (int i = 0; i < embed_dim / 2; ++i) {
        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
    }

    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int d = 0; d < embed_dim / 2; ++d) {
                float out_value = pos[h][w] * omega[d];
                emb[h][w][d] = sin(out_value);
                emb[h][w][d + embed_dim / 2] = cos(out_value);
            }
        }
    }

    return emb;
}

static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
    assert(embed_dim % 2 == 0);
    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)

    int H = emb_h.size();
    int W = emb_h[0].size();
    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));

    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            for (int d = 0; d < embed_dim / 2; ++d) {
                emb[h][w][d] = emb_h[h][w][d];
                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
            }
        }
    }
    return emb;
}

static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
    int grid_h_size = image_size.first;
    int grid_w_size = image_size.second;

    std::vector<float> grid_h(grid_h_size);
    std::vector<float> grid_w(grid_w_size);

    for (int i = 0; i < grid_h_size; ++i) {
        grid_h[i] = static_cast<float>(i);
    }
    for (int i = 0; i < grid_w_size; ++i) {
        grid_w[i] = static_cast<float>(i);
    }

    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
    for (int h = 0; h < grid_h_size; ++h) {
        for (int w = 0; w < grid_w_size; ++w) {
            grid[h][w] = grid_w[w];
        }
    }
    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
    for (int h = 0; h < grid_h_size; ++h) {
        for (int w = 0; w < grid_w_size; ++w) {
            grid_2d[0][h][w] = grid_h[h];
            grid_2d[1][h][w] = grid_w[w];
        }
    }

    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);

    int H = image_size.first;
    int W = image_size.second;
    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
    for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
        }
    }

    return pos_embed_2d;
}

bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    clip_image_f32_batch imgs;
    clip_image_f32_ptr img_copy(clip_image_f32_init());

@@ -4793,27 +4732,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                }
                set_input_i32("positions", positions);

                // inspired from resampler of Qwen-VL:
                // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
                // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
                int embed_dim = clip_n_mmproj_embd(ctx);

                // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
                auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

                std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
                for(int i = 0; i < pos_w * pos_h; ++i){
                    for(int j = 0; j < embed_dim; ++j){
                        pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
                    }
                // inputs for resampler projector
                // set the 2D positions (using float for sinusoidal embedding)
                int n_patches_per_col = image_size_width / patch_size;
                std::vector<float> pos_data(n_pos);
                // dimension H
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = static_cast<float>(i / n_patches_per_col);
                }

                set_input_f32("pos_embed", pos_embed);
                set_input_f32("pos_h", pos_data);
                // dimension W
                for (int i = 0; i < n_pos; i++) {
                    pos_data[i] = static_cast<float>(i % n_patches_per_col);
                }
                set_input_f32("pos_w", pos_data);
                // base frequency omega
                const float base_freq = 10000.0f;
                const int n_embd_proj = clip_n_mmproj_embd(ctx);
                std::vector<float> omega(n_embd_proj / 4);
                for (int i = 0; i < n_embd_proj / 4; ++i) {
                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
                }
                set_input_f32("omega", omega);
            } break;
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN3VL:
            {
                const int merge_ratio = 2;
                const int merge_ratio = hparams.n_merge;
                const int pw = image_size_width / patch_size;
                const int ph = image_size_height / patch_size;
                std::vector<int> positions(n_pos * 4);

@@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
}

mtmd_context_params mtmd_context_params_default() {
    mtmd_context_params params;
    params.use_gpu = true;
    params.print_timings = true;
    params.n_threads = 4;
    params.verbosity = GGML_LOG_LEVEL_INFO;
    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
    params.media_marker = mtmd_default_marker();
    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
    params.image_min_tokens = -1;
    params.image_max_tokens = -1;
    mtmd_context_params params {
        /* use_gpu */ true,
        /* print_timings */ true,
        /* n_threads */ 4,
        /* verbosity */ GGML_LOG_LEVEL_INFO,
        /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
        /* media_marker */ mtmd_default_marker(),
        /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
        /* image_min_tokens */ -1,
        /* image_max_tokens */ -1,
    };
    return params;
}

@@ -172,13 +173,13 @@ struct mtmd_context {
        throw std::runtime_error("media_marker must not be empty");
    }

    clip_context_params ctx_clip_params;
    ctx_clip_params.use_gpu = ctx_params.use_gpu;
    ctx_clip_params.verbosity = ctx_params.verbosity;
    ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
    // custom image token limits
    ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
    ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
    clip_context_params ctx_clip_params {
        /* use_gpu */ ctx_params.use_gpu,
        /* verbosity */ ctx_params.verbosity,
        /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
        /* image_min_tokens */ ctx_params.image_min_tokens,
        /* image_max_tokens */ ctx_params.image_max_tokens,
    };

    auto res = clip_init(mmproj_fname, ctx_clip_params);
    ctx_v = res.ctx_v;

@@ -277,7 +277,7 @@ For more details, please refer to [multimodal documentation](../../docs/multimod

## Web UI

The project includes a web-based user interface that enables interaction with the model through the `/chat/completions` endpoint.
The project includes a web-based user interface that enables interaction with the model through the `/v1/chat/completions` endpoint.

The web UI is developed using:
- `react` framework for frontend development

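As an illustration of the endpoint mentioned above, a minimal, hypothetical Python client; it assumes a llama-server instance listening on localhost:8080 and an OpenAI-style response shape:

import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps({"messages": [{"role": "user", "content": "Hello!"}]}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    reply = json.load(resp)
print(reply["choices"][0]["message"]["content"])
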
@@ -2400,7 +2400,7 @@ struct server_context {

        add_bos_token = llama_vocab_get_add_bos(vocab);

        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
        if (params_base.has_speculative()) {
            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());

            auto params_dft = params_base;

@@ -2476,7 +2476,7 @@ struct server_context {
                SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
            }

            if (!params_base.speculative.model.path.empty()) {
            if (params_base.has_speculative()) {
                SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal");
                return false;
            }

@@ -2520,6 +2520,7 @@ struct server_context {
            if (model_dft) {
                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);

                // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK]
                slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
                if (slot.ctx_dft == nullptr) {
                    SRV_ERR("%s", "failed to create draft context\n");

@@ -2825,6 +2826,7 @@ struct server_context {
        }

        // initialize draft batch
        // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK]
        if (slot.ctx_dft) {
            llama_batch_free(slot.batch_spec);

@@ -3830,7 +3832,9 @@ struct server_context {
                    // the largest pos_min required for a checkpoint to be useful
                    const auto pos_min_thold = std::max(0, n_past - n_swa);

                    if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
                    // note: disallow with mtmd contexts for now
                    // https://github.com/ggml-org/llama.cpp/issues/17043
                    if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) {
                        const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                        if (pos_min == -1) {
                            SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);

@@ -4291,6 +4295,8 @@ struct server_context {
        }

        // do speculative decoding
        // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK]
        // perform the speculative drafting for all sequences at the same time in a single batch
        for (auto & slot : slots) {
            if (!slot.is_processing() || !slot.can_speculate()) {
                continue;

@@ -4445,8 +4451,10 @@ int main(int argc, char ** argv) {

    // TODO: should we have a separate n_parallel parameter for the server?
    // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177
    if (params.n_parallel == 1 && params.kv_unified == false) {
        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__);
    // TODO: this is a common configuration that is suitable for most local use cases
    // however, overriding the parameters is a bit confusing - figure out something more intuitive
    if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) {
        LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__);

        params.n_parallel = 4;
        params.kv_unified = true;