mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
34 Commits
b2862
...
sl/async-w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5de9b743f8 | ||
|
|
8f7080bf48 | ||
|
|
e1b40ac3b9 | ||
|
|
dc020985b8 | ||
|
|
344f9126cc | ||
|
|
9a17ab914b | ||
|
|
ea3b0590ee | ||
|
|
29499bb593 | ||
|
|
48aa8fd1f2 | ||
|
|
583fd6b000 | ||
|
|
9f773486ab | ||
|
|
e8a7fd4fb0 | ||
|
|
a5e3fde857 | ||
|
|
f308ea7059 | ||
|
|
c3c88f296a | ||
|
|
182adefcf3 | ||
|
|
0d26d8ccd8 | ||
|
|
4f0263633b | ||
|
|
1265c670fd | ||
|
|
5e31828d3e | ||
|
|
541600201e | ||
|
|
efc8f767c8 | ||
|
|
e0f556186b | ||
|
|
27f65d6267 | ||
|
|
ee52225067 | ||
|
|
614d3b914e | ||
|
|
30e70334f7 | ||
|
|
1c570d8bee | ||
|
|
948f4ec7c5 | ||
|
|
9aa672490c | ||
|
|
b1f8af1886 | ||
|
|
e586ee4259 | ||
|
|
cbf75894d2 | ||
|
|
0d5cef78ae |
47
.github/workflows/build.yml
vendored
47
.github/workflows/build.yml
vendored
@@ -340,6 +340,36 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-latest-cmake-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_RPC=ON ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-22-cmake-vulkan:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -663,6 +693,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'rpc'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
@@ -898,9 +930,9 @@ jobs:
|
||||
shell: bash
|
||||
|
||||
env:
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
|
||||
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
|
||||
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
|
||||
|
||||
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
@@ -932,6 +964,17 @@ jobs:
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
|
||||
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
|
||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||
echo "cp oneAPI running time dll files to ./build/bin done"
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
|
||||
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_RPC "llama: use RPC" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_RPC)
|
||||
add_compile_definitions(GGML_USE_RPC)
|
||||
|
||||
if (WIN32)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
|
||||
endif()
|
||||
|
||||
set(GGML_HEADERS_RPC ggml-rpc.h)
|
||||
set(GGML_SOURCES_RPC ggml-rpc.cpp)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
||||
|
||||
@@ -532,7 +532,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||
&& cmake --build build --config Release -- -j 16
|
||||
```
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
|
||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
|
||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||
|
||||
@@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||
return true;
|
||||
}
|
||||
if (arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
params.rpc_servers = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
return true;
|
||||
@@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
}
|
||||
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
@@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
mparams.rpc_servers = params.rpc_servers.c_str();
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
|
||||
@@ -82,6 +82,7 @@ struct gpt_params {
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
@@ -74,9 +74,9 @@ models = [
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
|
||||
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
|
||||
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
|
||||
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
|
||||
@@ -240,23 +240,6 @@ class Model:
|
||||
return False
|
||||
|
||||
def write_tensors(self):
|
||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||
def np_fp32_to_bf16(n: np.ndarray):
|
||||
# force nan to quiet
|
||||
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
|
||||
# flush subnormals to zero
|
||||
n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
|
||||
# round to nearest even
|
||||
n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
|
||||
return n.astype(np.int16)
|
||||
|
||||
# Doing this row-wise is much, much faster than element-wise, hence the signature
|
||||
v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
|
||||
if self.lazy:
|
||||
# TODO: find a way to implicitly wrap np.vectorize functions
|
||||
# NOTE: the type is changed to reflect otypes passed to np.vectorize above
|
||||
v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
|
||||
|
||||
max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
@@ -309,27 +292,31 @@ class Model:
|
||||
))
|
||||
|
||||
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
|
||||
if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
|
||||
if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
data = gguf.quantize_bf16(data)
|
||||
assert data.dtype == np.int16
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
|
||||
data = gguf.quantize_q8_0(data)
|
||||
assert data.dtype == np.uint8
|
||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||
|
||||
else: # default to float16 for quantized tensors
|
||||
if data_dtype != np.float16:
|
||||
data = data.astype(np.float16)
|
||||
data_qtype = gguf.GGMLQuantizationType.F16
|
||||
|
||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
|
||||
if data_dtype != np.float32:
|
||||
data = data.astype(np.float32)
|
||||
data = v_fp32_to_bf16(data.view(np.int32))
|
||||
assert data.dtype == np.int16
|
||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||
|
||||
else: # by default, convert to float32
|
||||
if data_qtype is None: # by default, convert to float32
|
||||
if data_dtype != np.float32:
|
||||
data = data.astype(np.float32)
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
assert data_qtype is not None
|
||||
|
||||
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
|
||||
shape_str = f"""{{{', '.join(str(n) for n in reversed(
|
||||
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
|
||||
)}}}"""
|
||||
|
||||
# n_dims is implicit in the shape
|
||||
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
@@ -475,13 +462,13 @@ class Model:
|
||||
res = "dbrx"
|
||||
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
|
||||
res = "jina-en"
|
||||
res = "jina-v2-en"
|
||||
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
|
||||
res = "jina-es"
|
||||
res = "jina-v2-es"
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-de"
|
||||
res = "jina-v2-de"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -859,6 +846,7 @@ class BaichuanModel(Model):
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
@@ -981,6 +969,7 @@ class XverseModel(Model):
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "linear":
|
||||
@@ -1215,6 +1204,7 @@ class StableLMModel(Model):
|
||||
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
|
||||
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
_q_norms: list[dict[str, Tensor]] | None = None
|
||||
_k_norms: list[dict[str, Tensor]] | None = None
|
||||
@@ -1591,6 +1581,7 @@ class QwenModel(Model):
|
||||
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
|
||||
@Model.register("Qwen2ForCausalLM")
|
||||
@@ -1828,6 +1819,7 @@ class PlamoModel(Model):
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
|
||||
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def shuffle_attn_q_weight(self, data_torch):
|
||||
assert data_torch.size() == (5120, 5120)
|
||||
@@ -2007,6 +1999,7 @@ in chat mode so that the conversation can end normally.")
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
num_heads = self.hparams["num_attention_heads"]
|
||||
@@ -2415,25 +2408,15 @@ class LazyTorchTensor(gguf.LazyBase):
|
||||
def numpy(self) -> gguf.LazyNumpyTensor:
|
||||
dtype = self._dtype_map[self.dtype]
|
||||
return gguf.LazyNumpyTensor(
|
||||
meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
|
||||
meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
|
||||
lazy=self._lazy,
|
||||
args=(self,),
|
||||
func=(lambda s: s[0].numpy())
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def eager_to_meta(cls, t: Tensor) -> Tensor:
|
||||
if t.is_meta:
|
||||
return t
|
||||
return t.detach().to("meta")
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
|
||||
m = m.detach()
|
||||
if not m.is_meta:
|
||||
m = m.to("meta")
|
||||
m.dtype = dtype
|
||||
return m
|
||||
def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
|
||||
return torch.empty(size=shape, dtype=dtype, device="meta")
|
||||
|
||||
@classmethod
|
||||
def __torch_function__(cls, func, types, args=(), kwargs=None):
|
||||
@@ -2464,8 +2447,8 @@ def parse_args() -> argparse.Namespace:
|
||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bigendian", action="store_true",
|
||||
@@ -2523,6 +2506,7 @@ def main() -> None:
|
||||
"f32": gguf.LlamaFileType.ALL_F32,
|
||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||
"auto": gguf.LlamaFileType.GUESSED,
|
||||
}
|
||||
|
||||
|
||||
178
convert.py
178
convert.py
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
|
||||
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
|
||||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
@@ -344,10 +344,47 @@ class Params:
|
||||
return params
|
||||
|
||||
|
||||
@dataclass
|
||||
class Metadata:
|
||||
name: Optional[str] = None
|
||||
author: Optional[str] = None
|
||||
version: Optional[str] = None
|
||||
url: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
licence: Optional[str] = None
|
||||
source_url: Optional[str] = None
|
||||
source_hf_repo: Optional[str] = None
|
||||
|
||||
@staticmethod
|
||||
def load(metadata_path: Path) -> Metadata:
|
||||
if metadata_path is None or not metadata_path.exists():
|
||||
return Metadata()
|
||||
|
||||
with open(metadata_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
# Create a new Metadata instance
|
||||
metadata = Metadata()
|
||||
|
||||
# Assigning values to Metadata attributes if they exist in the JSON file
|
||||
# This is based on LLM_KV_NAMES mapping in llama.cpp
|
||||
metadata.name = data.get("general.name")
|
||||
metadata.author = data.get("general.author")
|
||||
metadata.version = data.get("general.version")
|
||||
metadata.url = data.get("general.url")
|
||||
metadata.description = data.get("general.description")
|
||||
metadata.license = data.get("general.license")
|
||||
metadata.source_url = data.get("general.source.url")
|
||||
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
#
|
||||
# vocab
|
||||
#
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class BaseVocab(Protocol):
|
||||
tokenizer_model: ClassVar[str]
|
||||
@@ -1066,21 +1103,42 @@ class OutputFile:
|
||||
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
|
||||
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
def add_meta_model(self, params: Params, metadata: Metadata) -> None:
|
||||
# Metadata About The Model And Its Provenence
|
||||
name = "LLaMA"
|
||||
|
||||
# TODO: better logic to determine model name
|
||||
if params.n_ctx == 4096:
|
||||
name = "LLaMA v2"
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
name = str(params.path_model.parent).split("/")[-1]
|
||||
elif params.n_ctx == 4096:
|
||||
# Heuristic detection of LLaMA v2 model
|
||||
name = "LLaMA v2"
|
||||
|
||||
self.gguf.add_name (name)
|
||||
self.gguf.add_vocab_size (params.n_vocab)
|
||||
self.gguf.add_context_length (params.n_ctx)
|
||||
self.gguf.add_embedding_length (params.n_embd)
|
||||
self.gguf.add_block_count (params.n_layer)
|
||||
self.gguf.add_feed_forward_length (params.n_ff)
|
||||
self.gguf.add_name(name)
|
||||
|
||||
if metadata is not None:
|
||||
if metadata.author is not None:
|
||||
self.gguf.add_author(metadata.author)
|
||||
if metadata.version is not None:
|
||||
self.gguf.add_version(metadata.version)
|
||||
if metadata.url is not None:
|
||||
self.gguf.add_url(metadata.url)
|
||||
if metadata.description is not None:
|
||||
self.gguf.add_description(metadata.description)
|
||||
if metadata.licence is not None:
|
||||
self.gguf.add_licence(metadata.licence)
|
||||
if metadata.source_url is not None:
|
||||
self.gguf.add_source_url(metadata.source_url)
|
||||
if metadata.source_hf_repo is not None:
|
||||
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
# Metadata About The Neural Architecture Itself
|
||||
self.gguf.add_vocab_size(params.n_vocab)
|
||||
self.gguf.add_context_length(params.n_ctx)
|
||||
self.gguf.add_embedding_length(params.n_embd)
|
||||
self.gguf.add_block_count(params.n_layer)
|
||||
self.gguf.add_feed_forward_length(params.n_ff)
|
||||
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
||||
self.gguf.add_head_count (params.n_head)
|
||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||
@@ -1183,13 +1241,14 @@ class OutputFile:
|
||||
@staticmethod
|
||||
def write_vocab_only(
|
||||
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
# meta data
|
||||
of.add_meta_model(params, metadata)
|
||||
of.add_meta_arch(params)
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
@@ -1216,12 +1275,14 @@ class OutputFile:
|
||||
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
|
||||
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||
pad_vocab: bool = False,
|
||||
metadata: Metadata = None,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
# meta data
|
||||
of.add_meta_model(params, metadata)
|
||||
of.add_meta_arch(params)
|
||||
if isinstance(vocab, Vocab):
|
||||
of.add_meta_vocab(vocab)
|
||||
@@ -1257,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
|
||||
raise ValueError(f"Unexpected combination of types: {name_to_type}")
|
||||
|
||||
|
||||
def model_parameter_count(model: LazyModel) -> int:
|
||||
total_model_parameters = 0
|
||||
for i, (name, lazy_tensor) in enumerate(model.items()):
|
||||
sum_weights_in_tensor = 1
|
||||
for dim in lazy_tensor.shape:
|
||||
sum_weights_in_tensor *= dim
|
||||
total_model_parameters += sum_weights_in_tensor
|
||||
return total_model_parameters
|
||||
|
||||
|
||||
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
|
||||
if model_params_count > 1e12 :
|
||||
# Trillions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-12
|
||||
scale_suffix = "T"
|
||||
elif model_params_count > 1e9 :
|
||||
# Billions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-9
|
||||
scale_suffix = "B"
|
||||
elif model_params_count > 1e6 :
|
||||
# Millions Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-6
|
||||
scale_suffix = "M"
|
||||
else:
|
||||
# Thousands Of Parameters
|
||||
scaled_model_params = model_params_count * 1e-3
|
||||
scale_suffix = "K"
|
||||
|
||||
return f"{round(scaled_model_params)}{scale_suffix}"
|
||||
|
||||
|
||||
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
|
||||
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
|
||||
for (name, tensor) in model.items()}
|
||||
@@ -1436,13 +1528,35 @@ class VocabFactory:
|
||||
return vocab, special_vocab
|
||||
|
||||
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
||||
namestr = {
|
||||
GGMLFileType.AllF32: "f32",
|
||||
GGMLFileType.MostlyF16: "f16",
|
||||
GGMLFileType.MostlyQ8_0:"q8_0",
|
||||
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
|
||||
quantization = {
|
||||
GGMLFileType.AllF32: "F32",
|
||||
GGMLFileType.MostlyF16: "F16",
|
||||
GGMLFileType.MostlyQ8_0: "Q8_0",
|
||||
}[file_type]
|
||||
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
||||
|
||||
parameters = model_parameter_count_rounded_notation(model_params_count)
|
||||
|
||||
expert_count = ""
|
||||
if params.n_experts is not None:
|
||||
expert_count = f"{params.n_experts}x"
|
||||
|
||||
version = ""
|
||||
if metadata is not None and metadata.version is not None:
|
||||
version = f"-{metadata.version}"
|
||||
|
||||
name = "ggml-model"
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = params.path_model.name
|
||||
|
||||
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
|
||||
|
||||
|
||||
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
|
||||
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
|
||||
ret = model_paths[0].parent / f"{default_filename}.gguf"
|
||||
if ret in model_paths:
|
||||
logger.error(
|
||||
f"Error: Default output path ({ret}) would overwrite the input. "
|
||||
@@ -1480,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
|
||||
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
|
||||
if args.verbose:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
elif args.dump_single or args.dump:
|
||||
elif args.dump_single or args.dump or args.get_outfile:
|
||||
# Avoid printing anything besides the dump output
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
else:
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
metadata = Metadata.load(args.metadata)
|
||||
|
||||
if args.get_outfile:
|
||||
model_plus = load_some_model(args.model)
|
||||
params = Params.load(model_plus)
|
||||
model = convert_model_names(model_plus.model, params, args.skip_unknown)
|
||||
model_params_count = model_parameter_count(model_plus.model)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
|
||||
return
|
||||
|
||||
if args.no_vocab and args.vocab_only:
|
||||
raise ValueError("--vocab-only does not make sense with --no-vocab")
|
||||
|
||||
@@ -1504,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
else:
|
||||
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
||||
|
||||
model_params_count = model_parameter_count(model_plus.model)
|
||||
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
|
||||
|
||||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
@@ -1557,7 +1687,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
f_norm_eps = 1e-5,
|
||||
)
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||
endianess=endianess, pad_vocab=args.pad_vocab)
|
||||
endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
|
||||
logger.info(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
@@ -1570,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
model = convert_model_names(model, params, args.skip_unknown)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
|
||||
|
||||
params.ftype = ftype
|
||||
logger.info(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
|
||||
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
|
||||
logger.info(f"Wrote {outfile}")
|
||||
|
||||
|
||||
|
||||
@@ -49,4 +49,7 @@ else()
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(export-lora)
|
||||
if (LLAMA_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// clean up
|
||||
llama_print_timings(ctx);
|
||||
llama_batch_free(batch);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
|
||||
@@ -1358,6 +1358,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
p->print_test(t);
|
||||
fflush(p->fout);
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
|
||||
@@ -300,14 +300,10 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (auto & image : params.image) {
|
||||
if (prompt_contains_image(params.prompt)) {
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
if (!image_embed) {
|
||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
||||
return 1;
|
||||
}
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
@@ -316,7 +312,26 @@ int main(int argc, char ** argv) {
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
} else {
|
||||
for (auto & image : params.image) {
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
if (!image_embed) {
|
||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_print_timings(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
}
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||
struct {
|
||||
struct ggml_tensor * newline;
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
|
||||
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
|
||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
LOG_TEE("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
|
||||
} else {
|
||||
model.newline->data = newline_tmp->data;
|
||||
if (model.newline->data == NULL) {
|
||||
LOG_TEE("newline_tmp tensor data is NULL\n");
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||
// fill it with the image embeddings, ignoring the base
|
||||
|
||||
@@ -7,6 +7,8 @@ Also note that finetunes typically result in a higher perplexity value even thou
|
||||
|
||||
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
|
||||
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
|
||||
When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
|
||||
llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
|
||||
|
||||
By default only the mean perplexity value and the corresponding uncertainty is calculated.
|
||||
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
|
||||
@@ -32,7 +34,13 @@ In addition to the KL divergence the following statistics are calculated with `-
|
||||
|
||||
## LLaMA 3 8b Scoreboard
|
||||
|
||||
Results are sorted by Kullback-Leibler divergence relative to FP16.
|
||||
| Revision | f364eb6f |
|
||||
|:---------|:-------------------|
|
||||
| Backend | CUDA |
|
||||
| CPU | AMD Epyc 7742 |
|
||||
| GPU | 1x NVIDIA RTX 4090 |
|
||||
|
||||
Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
|
||||
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
|
||||
|
||||
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
|
||||
@@ -89,6 +97,12 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
|
||||
|
||||
## LLaMA 2 vs. LLaMA 3 Quantization comparison
|
||||
|
||||
| Revision | f364eb6f |
|
||||
|:---------|:-------------------|
|
||||
| Backend | CUDA |
|
||||
| CPU | AMD Epyc 7742 |
|
||||
| GPU | 1x NVIDIA RTX 4090 |
|
||||
|
||||
| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 |
|
||||
|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
|
||||
| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
|
||||
@@ -107,6 +121,50 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
|
||||
| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % |
|
||||
| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % |
|
||||
|
||||
## LLaMA 3 BF16 vs. FP16 comparison
|
||||
|
||||
| Revision | 83330d8c |
|
||||
|:---------|:--------------|
|
||||
| Backend | CPU |
|
||||
| CPU | AMD Epyc 7742 |
|
||||
| GPU | N/A |
|
||||
|
||||
Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
|
||||
|
||||
| Metric | Value |
|
||||
|--------------------------------|--------------------------|
|
||||
| Mean PPL(Q) | 6.227711 ± 0.037833 |
|
||||
| Mean PPL(base) | 6.225194 ± 0.037771 |
|
||||
| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990% |
|
||||
| Mean ln(PPL(Q)/PPL(base)) | 0.000404 ± 0.000086 |
|
||||
| Mean PPL(Q)/PPL(base) | 1.000404 ± 0.000086 |
|
||||
| Mean PPL(Q)-PPL(base) | 0.002517 ± 0.000536 |
|
||||
| Mean KLD | 0.00002515 ± 0.00000020 |
|
||||
| Maximum KLD | 0.012206 |
|
||||
| 99.9% KLD | 0.000799 |
|
||||
| 99.0% KLD | 0.000222 |
|
||||
| 99.0% KLD | 0.000222 |
|
||||
| Median KLD | 0.000013 |
|
||||
| 10.0% KLD | -0.000002 |
|
||||
| 5.0% KLD | -0.000008 |
|
||||
| 1.0% KLD | -0.000023 |
|
||||
| Minimum KLD | -0.000059 |
|
||||
| Mean Δp | -0.0000745 ± 0.0003952 % |
|
||||
| Maximum Δp | 4.186% |
|
||||
| 99.9% Δp | 1.049% |
|
||||
| 99.0% Δp | 0.439% |
|
||||
| 95.0% Δp | 0.207% |
|
||||
| 90.0% Δp | 0.125% |
|
||||
| 75.0% Δp | 0.029% |
|
||||
| Median Δp | 0.000% |
|
||||
| 25.0% Δp | -0.030% |
|
||||
| 10.0% Δp | -0.126% |
|
||||
| 5.0% Δp | -0.207% |
|
||||
| 1.0% Δp | -0.434% |
|
||||
| 0.1% Δp | -1.016% |
|
||||
| Minimum Δp | -4.672% |
|
||||
| RMS Δp | 0.150 ± 0.001 % |
|
||||
| Same top p | 99.739 ± 0.013 % |
|
||||
|
||||
## Old Numbers
|
||||
|
||||
|
||||
2
examples/rpc/CMakeLists.txt
Normal file
2
examples/rpc/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
add_executable(rpc-server rpc-server.cpp)
|
||||
target_link_libraries(rpc-server PRIVATE ggml llama)
|
||||
74
examples/rpc/README.md
Normal file
74
examples/rpc/README.md
Normal file
@@ -0,0 +1,74 @@
|
||||
## Overview
|
||||
|
||||
The `rpc-server` allows running `ggml` backend on a remote host.
|
||||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
||||
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
rpcb---|TCP|srva
|
||||
rpcb---|TCP|srvb
|
||||
rpcb-.-|TCP|srvn
|
||||
subgraph hostn[Host N]
|
||||
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hostb[Host B]
|
||||
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hosta[Host A]
|
||||
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph host[Main Host]
|
||||
ggml[llama.cpp]---rpcb[RPC backend]
|
||||
end
|
||||
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
|
||||
```
|
||||
|
||||
Each host can run a different backend, e.g. one with CUDA and another with Metal.
|
||||
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
|
||||
|
||||
## Usage
|
||||
|
||||
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
|
||||
For example, to build the CUDA backend with RPC support:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc-cuda
|
||||
cd build-rpc-cuda
|
||||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Then, start the `rpc-server` with the backend:
|
||||
|
||||
```bash
|
||||
$ bin/rpc-server 0.0.0.0 50052
|
||||
create_backend: using CUDA backend
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
|
||||
ggml_cuda_init: found 1 CUDA devices:
|
||||
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
|
||||
Starting RPC server on 0.0.0.0:50052
|
||||
```
|
||||
|
||||
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
|
||||
```
|
||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc
|
||||
cd build-rpc
|
||||
cmake .. -DLLAMA_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
```
|
||||
70
examples/rpc/rpc-server.cpp
Normal file
70
examples/rpc/rpc-server.cpp
Normal file
@@ -0,0 +1,70 @@
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
static ggml_backend_t create_backend() {
|
||||
ggml_backend_t backend = NULL;
|
||||
#ifdef GGML_USE_CUDA
|
||||
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||
backend = ggml_backend_cuda_init(0); // init device 0
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||
}
|
||||
#elif GGML_USE_METAL
|
||||
fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||
backend = ggml_backend_metal_init();
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
// if there aren't GPU Backends fallback to CPU backend
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: using CPU backend\n", __func__);
|
||||
backend = ggml_backend_cpu_init();
|
||||
}
|
||||
return backend;
|
||||
}
|
||||
|
||||
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||
#else
|
||||
// TODO: implement for other backends
|
||||
*free_mem = 1;
|
||||
*total_mem = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
const char * host = argv[1];
|
||||
int port = std::stoi(argv[2]);
|
||||
if (port <= 0 || port > 65535) {
|
||||
fprintf(stderr, "Invalid port number: %d\n", port);
|
||||
return 1;
|
||||
}
|
||||
ggml_backend_t backend = create_backend();
|
||||
if (!backend) {
|
||||
fprintf(stderr, "Failed to create backend\n");
|
||||
return 1;
|
||||
}
|
||||
printf("Starting RPC server on %s:%d\n", host, port);
|
||||
size_t free_mem, total_mem;
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
std::string endpoint = std::string(host) + ":" + std::to_string(port);
|
||||
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
}
|
||||
@@ -48,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
||||
- `--path`: Path from which to serve static files. Default: disabled
|
||||
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
|
||||
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
|
||||
- `--embedding`: Enable embedding extraction. Default: disabled
|
||||
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
|
||||
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
|
||||
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
@@ -293,13 +293,14 @@ def start_server_background(args):
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
|
||||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
|
||||
result = sock.connect_ex((server_fqdn, server_port))
|
||||
_is_server_listening = result == 0
|
||||
if _is_server_listening:
|
||||
print(f"server is listening on {server_fqdn}:{server_port}...")
|
||||
return _is_server_listening
|
||||
|
||||
try:
|
||||
url = f"{server_fqdn}:{server_port}/health"
|
||||
if not url.startswith("http://"):
|
||||
url = f"http://{url}"
|
||||
result = requests.get(url)
|
||||
return result.status_code == 200
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def escape_metric_name(metric_name):
|
||||
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
|
||||
|
||||
@@ -671,6 +671,13 @@ struct server_context {
|
||||
model = nullptr;
|
||||
}
|
||||
|
||||
// Clear any sampling context
|
||||
for (server_slot & slot : slots) {
|
||||
if (slot.ctx_sampling != nullptr) {
|
||||
llama_sampling_free(slot.ctx_sampling);
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
|
||||
@@ -887,6 +887,7 @@ async def oai_chat_completions(user_prompt,
|
||||
base_path,
|
||||
async_client,
|
||||
debug=False,
|
||||
temperature=None,
|
||||
model=None,
|
||||
n_predict=None,
|
||||
enable_streaming=None,
|
||||
@@ -913,7 +914,8 @@ async def oai_chat_completions(user_prompt,
|
||||
"model": model,
|
||||
"max_tokens": n_predict,
|
||||
"stream": enable_streaming,
|
||||
"seed": seed
|
||||
"temperature": temperature if temperature is not None else 0.0,
|
||||
"seed": seed,
|
||||
}
|
||||
if response_format is not None:
|
||||
payload['response_format'] = response_format
|
||||
@@ -978,7 +980,8 @@ async def oai_chat_completions(user_prompt,
|
||||
max_tokens=n_predict,
|
||||
stream=enable_streaming,
|
||||
response_format=payload.get('response_format'),
|
||||
seed=seed
|
||||
seed=seed,
|
||||
temperature=payload['temperature']
|
||||
)
|
||||
except openai.error.AuthenticationError as e:
|
||||
if expect_api_error is not None and expect_api_error:
|
||||
|
||||
@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
|
||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
||||
llama_params["stream"] = json_value(body, "stream", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 1.0);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
||||
|
||||
// Apply chat template to the list of messages
|
||||
|
||||
@@ -114,6 +114,8 @@ extern "C" {
|
||||
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
||||
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
||||
|
||||
ggml_backend_t (*GGML_CALL backend_dup)(ggml_backend_t backend);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
|
||||
167
ggml-backend.c
167
ggml-backend.c
@@ -180,6 +180,13 @@ void ggml_backend_free(ggml_backend_t backend) {
|
||||
backend->iface.free(backend);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_dup(ggml_backend_t backend) {
|
||||
if (backend->iface.backend_dup) {
|
||||
return backend->iface.backend_dup(backend);
|
||||
}
|
||||
return backend;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return backend->iface.get_default_buffer_type(backend);
|
||||
}
|
||||
@@ -855,6 +862,7 @@ static struct ggml_backend_i cpu_backend_i = {
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
/* .backend_dup = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
||||
@@ -1026,16 +1034,34 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
||||
#define GGML_SCHED_MAX_COPIES 4
|
||||
#endif
|
||||
|
||||
#ifndef GGML_SCHED_MAX_COPY_STREAMS
|
||||
#define GGML_SCHED_MAX_COPY_STREAMS 8
|
||||
#endif
|
||||
|
||||
struct ggml_backend_sched_split {
|
||||
int backend_id;
|
||||
int i_start;
|
||||
int i_end;
|
||||
|
||||
// input tensors from other backends
|
||||
struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
||||
int n_inputs;
|
||||
|
||||
// copy stream to use to copy the inputs that are weights (-1 = no copy stream)
|
||||
int w_copy_stream_id;
|
||||
|
||||
// graph view of this split
|
||||
struct ggml_cgraph graph;
|
||||
};
|
||||
|
||||
struct ggml_backend_sched_copy_stream {
|
||||
ggml_backend_t stream;
|
||||
ggml_backend_buffer_t buffer;
|
||||
ggml_backend_event_t event_copy;
|
||||
ggml_backend_event_t event_use;
|
||||
size_t max_size;
|
||||
};
|
||||
|
||||
struct ggml_backend_sched {
|
||||
bool is_reset; // true if the scheduler has been reset since the last graph split
|
||||
bool is_alloc;
|
||||
@@ -1046,6 +1072,9 @@ struct ggml_backend_sched {
|
||||
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
|
||||
ggml_gallocr_t galloc;
|
||||
|
||||
struct ggml_backend_sched_copy_stream copy_streams[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPY_STREAMS];
|
||||
int cur_copy_stream[GGML_SCHED_MAX_BACKENDS];
|
||||
|
||||
// hash keys of the nodes in the graph
|
||||
struct ggml_hash_set hash_set;
|
||||
// hash values
|
||||
@@ -1228,6 +1257,14 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||
//#define DEBUG_PASS3
|
||||
//#define DEBUG_PASS4
|
||||
|
||||
static void init_split(ggml_backend_sched_t sched, struct ggml_backend_sched_split * split, int backend_id, int i_start) {
|
||||
split->backend_id = backend_id;
|
||||
split->i_start = i_start;
|
||||
split->i_end = -1;
|
||||
split->n_inputs = 0;
|
||||
split->w_copy_stream_id = -1;
|
||||
}
|
||||
|
||||
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
||||
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
// reset splits
|
||||
@@ -1406,19 +1443,17 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
// pass 4: split graph, find tensors that need to be copied
|
||||
{
|
||||
int i_split = 0;
|
||||
int cur_backend_id = 0;
|
||||
struct ggml_backend_sched_split * split = &sched->splits[0];
|
||||
// find the backend of the first split, skipping view ops
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (!ggml_is_view_op(node->op)) {
|
||||
split->backend_id = tensor_backend_id(node);
|
||||
cur_backend_id = tensor_backend_id(node);
|
||||
break;
|
||||
}
|
||||
}
|
||||
split->i_start = 0;
|
||||
split->n_inputs = 0;
|
||||
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
|
||||
int cur_backend_id = split->backend_id;
|
||||
init_split(sched, split, cur_backend_id, 0);
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
@@ -1433,6 +1468,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
// check if we should start a new split based on the sources of the current node
|
||||
bool need_new_split = false;
|
||||
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
||||
if (split->w_copy_stream_id != -1) {
|
||||
// the previous op used a weight copy stream, start a new split to allow the next copy to start immediately after the op
|
||||
need_new_split = true;
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
@@ -1452,7 +1492,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
const size_t id = hash_id(src);
|
||||
int src_backend_id = sched->tensor_backend_id[id];
|
||||
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
|
||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
||||
need_new_split = true;
|
||||
break;
|
||||
}
|
||||
@@ -1470,10 +1509,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
|
||||
split = &sched->splits[i_split];
|
||||
split->backend_id = node_backend_id;
|
||||
split->i_start = i;
|
||||
split->n_inputs = 0;
|
||||
cur_backend_id = node_backend_id;
|
||||
init_split(sched, split, cur_backend_id, i);
|
||||
}
|
||||
|
||||
// find inputs that are not on the same backend
|
||||
@@ -1529,6 +1566,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
int n_inputs = split->n_inputs++;
|
||||
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
||||
split->inputs[n_inputs] = src;
|
||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && split->w_copy_stream_id == -1 && GGML_SCHED_MAX_COPY_STREAMS > 0) {
|
||||
split->w_copy_stream_id = sched->cur_copy_stream[cur_backend_id];
|
||||
sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size = MAX(
|
||||
sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size,
|
||||
ggml_backend_buft_get_alloc_size(sched->bufts[cur_backend_id], src));
|
||||
sched->cur_copy_stream[cur_backend_id] = (sched->cur_copy_stream[cur_backend_id] + 1) % GGML_SCHED_MAX_COPY_STREAMS;
|
||||
}
|
||||
}
|
||||
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
|
||||
}
|
||||
@@ -1540,6 +1584,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
#ifdef DEBUG_PASS4
|
||||
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
||||
#endif
|
||||
if (getenv("GGML_DEBUG_SCHED")) {
|
||||
fprintf(stderr, "SPLIT GRAPH\n");
|
||||
ggml_backend_sched_print_assignments(sched, graph);
|
||||
}
|
||||
|
||||
// create copies of the graph for each split
|
||||
// TODO: avoid this copy
|
||||
@@ -1613,6 +1661,25 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||
}
|
||||
|
||||
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||
// allocate weights in the copy buffers
|
||||
for (int s = 0; s < sched->n_splits; s++) {
|
||||
struct ggml_backend_sched_split * split = &sched->splits[s];
|
||||
if (split->w_copy_stream_id != -1) {
|
||||
struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[split->backend_id][split->w_copy_stream_id];
|
||||
ggml_backend_buffer_t buffer = stream->buffer;
|
||||
if (buffer == NULL) {
|
||||
continue;
|
||||
}
|
||||
for (int j = 0; j < split->n_inputs; j++) {
|
||||
struct ggml_tensor * input = split->inputs[j];
|
||||
if (input->buffer != NULL && input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
|
||||
ggml_backend_tensor_alloc(buffer, input_cpy, ggml_backend_buffer_get_base(buffer));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// allocate graph
|
||||
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
||||
// the re-allocation may cause the split inputs to be moved to a different address
|
||||
@@ -1637,6 +1704,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
struct ggml_backend_sched_split * split = &splits[i];
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
struct ggml_backend_sched_copy_stream * stream = NULL;
|
||||
|
||||
if (split->w_copy_stream_id != -1) {
|
||||
stream = &sched->copy_streams[split_backend_id][split->w_copy_stream_id];
|
||||
}
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int j = 0; j < split->n_inputs; j++) {
|
||||
@@ -1644,7 +1716,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
struct ggml_tensor * input = split->inputs[j];
|
||||
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
|
||||
|
||||
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
if (input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && stream && stream->stream) {
|
||||
ggml_backend_tensor_copy_async(input_backend, stream->stream, input, input_cpy);
|
||||
} else if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
@@ -1663,6 +1737,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
if (stream && stream->stream) {
|
||||
ggml_backend_event_record(stream->event_copy);
|
||||
ggml_backend_event_wait(split_backend, stream->event_copy);
|
||||
}
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
@@ -1702,6 +1781,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
// record event of this copy stream
|
||||
if (stream && stream->stream) {
|
||||
ggml_backend_event_record(stream->event_use);
|
||||
ggml_backend_event_wait(stream->stream, stream->event_use);
|
||||
}
|
||||
|
||||
// record the event of this copy
|
||||
if (split->n_inputs > 0) {
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
@@ -1766,11 +1851,19 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||
if (sched == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
for (int c = 0; c < sched->n_copies; c++) {
|
||||
ggml_backend_event_free(sched->events[b][c]);
|
||||
}
|
||||
for (int s = 0; s < GGML_SCHED_MAX_COPY_STREAMS; s++) {
|
||||
ggml_backend_buffer_free(sched->copy_streams[b][s].buffer);
|
||||
ggml_backend_event_free(sched->copy_streams[b][s].event_copy);
|
||||
ggml_backend_event_free(sched->copy_streams[b][s].event_use);
|
||||
ggml_backend_free(sched->copy_streams[b][s].stream);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
free(sched->splits);
|
||||
@@ -1789,6 +1882,7 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
|
||||
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
|
||||
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
|
||||
memset(sched->cur_copy_stream, 0, sizeof(sched->cur_copy_stream[0]) * sched->n_backends);
|
||||
|
||||
sched->is_reset = true;
|
||||
}
|
||||
@@ -1800,7 +1894,46 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
// TODO: extract this to a separate function
|
||||
// allocate tensor copy streams
|
||||
for (int b = 0; b < sched->n_backends; b++) {
|
||||
for (int j = 0; j < GGML_SCHED_MAX_COPY_STREAMS; j++) {
|
||||
struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[b][j];
|
||||
if (stream->max_size > 0) {
|
||||
// backend
|
||||
if (!stream->stream) {
|
||||
stream->stream = ggml_backend_dup(sched->backends[b]);
|
||||
}
|
||||
|
||||
if (!stream->stream) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// events
|
||||
if (!stream->event_copy) {
|
||||
stream->event_copy = ggml_backend_event_new(stream->stream);
|
||||
}
|
||||
|
||||
if (!stream->event_use) {
|
||||
stream->event_use = ggml_backend_event_new(sched->backends[b]);
|
||||
}
|
||||
|
||||
if (!stream->event_copy || !stream->event_use) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// buffer
|
||||
if (!stream->buffer || ggml_backend_buffer_get_size(stream->buffer) < stream->max_size) {
|
||||
ggml_backend_buffer_free(stream->buffer);
|
||||
stream->buffer = ggml_backend_buft_alloc_buffer(sched->bufts[b], stream->max_size);
|
||||
if (stream->buffer == NULL) {
|
||||
fprintf(stderr, "%s: failed to allocate buffer for copy stream\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
||||
return false;
|
||||
}
|
||||
@@ -1868,7 +2001,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
|
||||
return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
size_t size = ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
|
||||
|
||||
for (int i = 0; i < GGML_SCHED_MAX_COPY_STREAMS; i++) {
|
||||
if (sched->copy_streams[backend_index][i].buffer == NULL) {
|
||||
continue;
|
||||
}
|
||||
size += ggml_backend_buffer_get_size(sched->copy_streams[backend_index][i].buffer);
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
@@ -1895,7 +2037,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
|
||||
|
||||
tensor->buffer = buffer;
|
||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||
tensor->backend = tensor->view_src->backend;
|
||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
}
|
||||
|
||||
|
||||
@@ -50,9 +50,10 @@ extern "C" {
|
||||
// Backend
|
||||
//
|
||||
|
||||
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
|
||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
|
||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_dup(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
||||
|
||||
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
}
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (cuda_graph_update_required) {
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||
} else {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||
@@ -2920,6 +2920,12 @@ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
|
||||
CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cuda_dup(ggml_backend_t backend) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||
|
||||
return ggml_backend_cuda_init(cuda_ctx->device);
|
||||
}
|
||||
|
||||
static ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_name,
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
@@ -2939,6 +2945,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .event_record = */ ggml_backend_cuda_event_record,
|
||||
/* .event_wait = */ ggml_backend_cuda_event_wait,
|
||||
/* .event_synchronize = */ ggml_backend_cuda_event_synchronize,
|
||||
/* .backend_dup = */ ggml_backend_cuda_dup,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cuda_guid() {
|
||||
|
||||
@@ -1,35 +1,36 @@
|
||||
#include "upscale.cuh"
|
||||
|
||||
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
|
||||
// blockIdx.z: idx of ne02*ne03
|
||||
// blockIdx.y: idx of ne01*scale_factor, aka ne1
|
||||
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
|
||||
// ne00xne01: ne00 * ne01
|
||||
int ne0 = ne00 * scale_factor;
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
static __global__ void upscale_f32(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (index >= ne10 * ne11 * ne12 * ne13) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int i00 = nidx / scale_factor;
|
||||
int i01 = blockIdx.y / scale_factor;
|
||||
int offset_src =
|
||||
i00 +
|
||||
i01 * ne00 +
|
||||
blockIdx.z * ne00xne01;
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
|
||||
int i10 = index % ne10;
|
||||
int i11 = (index / ne10) % ne11;
|
||||
int i12 = (index / (ne10 * ne11)) % ne12;
|
||||
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
||||
|
||||
int i00 = i10 / sf0;
|
||||
int i01 = i11 / sf1;
|
||||
int i02 = i12 / sf2;
|
||||
int i03 = i13 / sf3;
|
||||
|
||||
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
||||
}
|
||||
|
||||
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
|
||||
const int scale_factor, cudaStream_t stream) {
|
||||
int ne0 = (ne00 * scale_factor);
|
||||
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
|
||||
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
|
||||
static void upscale_f32_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
cudaStream_t stream) {
|
||||
int dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
||||
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
||||
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
||||
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
}
|
||||
|
||||
@@ -120,9 +120,16 @@ extern "C" {
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
|
||||
83
ggml-metal.m
83
ggml-metal.m
@@ -1378,7 +1378,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
|
||||
|
||||
if (ne00%4 == 0) {
|
||||
while (nth < ne00/4 && nth < 256) {
|
||||
while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
|
||||
nth *= 2;
|
||||
}
|
||||
if (use_f16) {
|
||||
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
|
||||
}
|
||||
} else {
|
||||
while (nth < ne00 && nth < 1024) {
|
||||
while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
|
||||
nth *= 2;
|
||||
}
|
||||
if (use_f16) {
|
||||
@@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const int sf = dst->op_params[0];
|
||||
const float sf0 = (float)ne0/src0->ne[0];
|
||||
const float sf1 = (float)ne1/src0->ne[1];
|
||||
const float sf2 = (float)ne2/src0->ne[2];
|
||||
const float sf3 = (float)ne3/src0->ne[3];
|
||||
|
||||
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
|
||||
|
||||
@@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
||||
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
|
||||
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
|
||||
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
|
||||
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
@@ -2512,13 +2518,14 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(ne11 % 32 == 0);
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
|
||||
GGML_ASSERT(ggml_are_same_shape (src1, src2));
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src1, src2));
|
||||
GGML_ASSERT(src3);
|
||||
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
|
||||
|
||||
size_t offs_src3 = 0;
|
||||
|
||||
@@ -2528,6 +2535,11 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
||||
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
||||
|
||||
const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
|
||||
const uint64_t nb21 = src2 ? src2->nb[1] : 0;
|
||||
const uint64_t nb22 = src2 ? src2->nb[2] : 0;
|
||||
const uint64_t nb23 = src2 ? src2->nb[3] : 0;
|
||||
|
||||
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
||||
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
||||
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|
||||
@@ -2590,34 +2602,35 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
||||
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
|
||||
if (id_src3) {
|
||||
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
|
||||
} else {
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
|
||||
}
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
|
||||
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
|
||||
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:21];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:23];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:24];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:25];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:26];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:27];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:28];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:29];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:23];
|
||||
[encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
|
||||
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
|
||||
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
|
||||
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
|
||||
|
||||
if (!use_vec_kernel) {
|
||||
// half8x8 kernel
|
||||
|
||||
@@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & sf,
|
||||
constant float & sf0,
|
||||
constant float & sf1,
|
||||
constant float & sf2,
|
||||
constant float & sf3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1/sf;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
const int64_t i03 = i3/sf3;
|
||||
const int64_t i02 = i2/sf2;
|
||||
const int64_t i01 = i1/sf1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
dst_ptr[i0] = src0_ptr[i0/sf];
|
||||
const int64_t i00 = i0/sf0;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_ptr[0] = src0_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2049,27 +2054,24 @@ typedef void (flash_attn_ext_f16_t)(
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb23,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
constant float & max_bias,
|
||||
constant float & m0,
|
||||
@@ -2090,27 +2092,24 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb23,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
constant float & max_bias,
|
||||
constant float & m0,
|
||||
@@ -2180,10 +2179,6 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
const short ne22 = ne12;
|
||||
const short ne23 = ne13;
|
||||
|
||||
const uint nb21 = nb11;
|
||||
const uint nb22 = nb12;
|
||||
const uint nb23 = nb13;
|
||||
|
||||
// broadcast
|
||||
const short rk2 = ne02/ne12;
|
||||
const short rk3 = ne03/ne13;
|
||||
@@ -2247,11 +2242,16 @@ kernel void kernel_flash_attn_ext_f16(
|
||||
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
|
||||
}
|
||||
|
||||
// mqk = mqk*scale + mask*slope
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
|
||||
simdgroup_multiply(mm, mslope, mm);
|
||||
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
|
||||
if (mask != q) {
|
||||
// mqk = mqk*scale + mask*slope
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
|
||||
simdgroup_multiply(mm, mslope, mm);
|
||||
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
|
||||
} else {
|
||||
// mqk = mqk*scale
|
||||
simdgroup_multiply(mqk, mscale, mqk);
|
||||
}
|
||||
|
||||
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
|
||||
}
|
||||
@@ -2425,27 +2425,24 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant uint64_t & nb21,
|
||||
constant uint64_t & nb22,
|
||||
constant uint64_t & nb23,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
constant float & max_bias,
|
||||
constant float & m0,
|
||||
@@ -2521,10 +2518,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
const short ne22 = ne12;
|
||||
const short ne23 = ne13;
|
||||
|
||||
const uint nb21 = nb11;
|
||||
const uint nb22 = nb12;
|
||||
const uint nb23 = nb13;
|
||||
|
||||
// broadcast
|
||||
const short rk2 = ne02/ne12;
|
||||
const short rk3 = ne03/ne13;
|
||||
@@ -2589,8 +2582,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
|
||||
// mqk = mqk*scale + mask*slope
|
||||
if (tiisg == 0) {
|
||||
float4 mm = (float4) mp4[ic/4 + cc];
|
||||
mqk = mqk*scale + mm*slope;
|
||||
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
|
||||
|
||||
ss4[cc] = mqk;
|
||||
}
|
||||
|
||||
2169
ggml-quants.c
2169
ggml-quants.c
File diff suppressed because it is too large
Load Diff
1023
ggml-rpc.cpp
Normal file
1023
ggml-rpc.cpp
Normal file
File diff suppressed because it is too large
Load Diff
24
ggml-rpc.h
Normal file
24
ggml-rpc.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
|
||||
#pragma message("TODO: generalize upscale operator")
|
||||
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
|
||||
GGML_ASSERT(false && "TODO: generalize upscale operator");
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
|
||||
upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
|
||||
@@ -15564,26 +15568,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
|
||||
#if 0
|
||||
// use syclGemmEx
|
||||
{
|
||||
for (int i13 = 0; i13 < ne13; ++i13) {
|
||||
for (int i12 = 0; i12 < ne12; ++i12) {
|
||||
int i03 = i13 / r3;
|
||||
int i02 = i12 / r2;
|
||||
|
||||
SYCL_CHECK(
|
||||
syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
|
||||
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
|
||||
beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
|
||||
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
@@ -15595,7 +15579,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||
nb11 / nb10, nb12 / nb10, beta,
|
||||
(char *)dst_t, cu_data_type, ne01, nb2 / nb0,
|
||||
ne12 * ne13, cu_compute_type)));
|
||||
g_sycl_handles[g_main_device]->wait();
|
||||
} else {
|
||||
const int ne23 = ne12*ne13;
|
||||
|
||||
@@ -15626,7 +15609,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||
nb02, nb03, nb12_scaled, nb13_scaled,
|
||||
nbd2, nbd3, r2, r3, item_ct1);
|
||||
});
|
||||
}).wait();
|
||||
});
|
||||
}
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
|
||||
*g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
|
||||
@@ -15637,9 +15620,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
|
||||
dpct::library_data_t::real_half, nb11 / nb10, beta,
|
||||
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
|
||||
cu_compute_type)));
|
||||
g_sycl_handles[g_main_device]->wait();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (no_mixed_dtypes) {
|
||||
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
|
||||
|
||||
467
ggml.c
467
ggml.c
@@ -112,6 +112,8 @@ typedef void * thread_ret_t;
|
||||
|
||||
#endif
|
||||
|
||||
typedef pthread_t ggml_thread_t;
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
#include <hbwmalloc.h>
|
||||
#endif
|
||||
@@ -1306,6 +1308,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
||||
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
// Use vec_xl, not vec_ld, in case the load address is not aligned.
|
||||
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
|
||||
@@ -1537,6 +1541,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
||||
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
||||
#endif
|
||||
|
||||
//
|
||||
// ggml context
|
||||
//
|
||||
|
||||
struct ggml_context {
|
||||
size_t mem_size;
|
||||
void* mem_buffer;
|
||||
bool mem_buffer_owned;
|
||||
bool no_alloc;
|
||||
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
||||
|
||||
int n_objects;
|
||||
|
||||
struct ggml_object* objects_begin;
|
||||
struct ggml_object* objects_end;
|
||||
|
||||
struct ggml_scratch scratch;
|
||||
struct ggml_scratch scratch_save;
|
||||
};
|
||||
|
||||
struct ggml_context_container {
|
||||
bool used;
|
||||
|
||||
struct ggml_context context;
|
||||
};
|
||||
|
||||
struct ggml_compute_state_shared {
|
||||
const struct ggml_cgraph* cgraph;
|
||||
const struct ggml_cplan* cplan;
|
||||
|
||||
int64_t perf_node_start_cycles;
|
||||
int64_t perf_node_start_time_us;
|
||||
|
||||
const int n_threads;
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_active; // num active threads
|
||||
atomic_int node_n; // active graph node
|
||||
atomic_int node_task; // active graph node task phase
|
||||
|
||||
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||
void* abort_callback_data;
|
||||
|
||||
atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
|
||||
};
|
||||
|
||||
struct ggml_compute_state {
|
||||
ggml_thread_t thrd;
|
||||
int ith;
|
||||
struct ggml_compute_state_shared* shared;
|
||||
enum ggml_status ec;
|
||||
};
|
||||
|
||||
//
|
||||
// fundamental operations
|
||||
//
|
||||
@@ -2383,32 +2440,6 @@ static void ggml_setup_op_has_task_pass(void) {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// ggml context
|
||||
//
|
||||
|
||||
struct ggml_context {
|
||||
size_t mem_size;
|
||||
void * mem_buffer;
|
||||
bool mem_buffer_owned;
|
||||
bool no_alloc;
|
||||
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
|
||||
|
||||
int n_objects;
|
||||
|
||||
struct ggml_object * objects_begin;
|
||||
struct ggml_object * objects_end;
|
||||
|
||||
struct ggml_scratch scratch;
|
||||
struct ggml_scratch scratch_save;
|
||||
};
|
||||
|
||||
struct ggml_context_container {
|
||||
bool used;
|
||||
|
||||
struct ggml_context context;
|
||||
};
|
||||
|
||||
//
|
||||
// NUMA support
|
||||
//
|
||||
@@ -2822,6 +2853,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
||||
(t0->ne[3] == t1->ne[3] );
|
||||
}
|
||||
|
||||
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return
|
||||
(t0->nb[0] == t1->nb[0] ) &&
|
||||
(t0->nb[1] == t1->nb[1] ) &&
|
||||
(t0->nb[2] == t1->nb[2] ) &&
|
||||
(t0->nb[3] == t1->nb[3] );
|
||||
}
|
||||
|
||||
// check if t1 can be represented as a repeatition of t0
|
||||
static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
@@ -3166,6 +3207,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||
|
||||
struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
|
||||
|
||||
#ifdef __clang__
|
||||
// temporary until ggml_tensor::backend is removed
|
||||
#pragma clang diagnostic push
|
||||
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
*result = (struct ggml_tensor) {
|
||||
/*.type =*/ type,
|
||||
/*.backend =*/ GGML_BACKEND_TYPE_CPU,
|
||||
@@ -3188,6 +3235,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||
/*.padding =*/ { 0 },
|
||||
};
|
||||
|
||||
#ifdef __clang__
|
||||
#pragma clang diagnostic pop
|
||||
#endif
|
||||
|
||||
// TODO: this should not be needed as long as we don't rely on aligned SIMD loads
|
||||
//ggml_assert_aligned(result->data);
|
||||
|
||||
@@ -6281,7 +6332,10 @@ struct ggml_tensor * ggml_pool_2d(
|
||||
static struct ggml_tensor * ggml_upscale_impl(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int scale_factor) {
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3) {
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
@@ -6289,19 +6343,45 @@ static struct ggml_tensor * ggml_upscale_impl(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
GGML_ASSERT(a->ne[0] <= ne0);
|
||||
GGML_ASSERT(a->ne[1] <= ne1);
|
||||
GGML_ASSERT(a->ne[2] <= ne2);
|
||||
GGML_ASSERT(a->ne[3] <= ne3);
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
|
||||
a->ne[0] * scale_factor,
|
||||
a->ne[1] * scale_factor,
|
||||
a->ne[2], a->ne[3]);
|
||||
ne0,
|
||||
ne1,
|
||||
ne2,
|
||||
ne3
|
||||
);
|
||||
|
||||
result->op = GGML_OP_UPSCALE;
|
||||
result->op_params[0] = scale_factor;
|
||||
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_upscale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int scale_factor) {
|
||||
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_upscale_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3) {
|
||||
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
|
||||
}
|
||||
|
||||
// ggml_pad
|
||||
|
||||
struct ggml_tensor * ggml_pad(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -6326,12 +6406,7 @@ struct ggml_tensor * ggml_pad(
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_upscale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int scale_factor) {
|
||||
return ggml_upscale_impl(ctx, a, scale_factor);
|
||||
}
|
||||
// ggml_arange
|
||||
|
||||
struct ggml_tensor * ggml_arange(
|
||||
struct ggml_context * ctx,
|
||||
@@ -6353,6 +6428,8 @@ struct ggml_tensor * ggml_arange(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_timestep_embedding
|
||||
|
||||
struct ggml_tensor * ggml_timestep_embedding(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * timesteps,
|
||||
@@ -11767,9 +11844,101 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static void ggml_compute_forward_mul_mat_one_chunk(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const int64_t num_rows_per_vec_dot,
|
||||
const int64_t ir0_start,
|
||||
const int64_t ir0_end,
|
||||
const int64_t ir1_start,
|
||||
const int64_t ir1_end) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
//printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
|
||||
|
||||
// threads with no work simply yield (not sure if it helps)
|
||||
if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
|
||||
return;
|
||||
}
|
||||
|
||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
assert(ne12 % ne02 == 0);
|
||||
assert(ne13 % ne03 == 0);
|
||||
|
||||
// block-tiling attempt
|
||||
const int64_t blck_0 = 16;
|
||||
const int64_t blck_1 = 16;
|
||||
|
||||
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
||||
|
||||
// attempt to reduce false-sharing (does not seem to make a difference)
|
||||
// 16 * 2, accounting for mmla kernels
|
||||
float tmp[32];
|
||||
|
||||
for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
|
||||
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
|
||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
|
||||
const int64_t i13 = (ir1 / (ne12 * ne1));
|
||||
const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
|
||||
const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
|
||||
|
||||
// broadcast src0 into src1
|
||||
const int64_t i03 = i13 / r3;
|
||||
const int64_t i02 = i12 / r2;
|
||||
|
||||
const int64_t i1 = i11;
|
||||
const int64_t i2 = i12;
|
||||
const int64_t i3 = i13;
|
||||
|
||||
const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
|
||||
|
||||
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
||||
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
||||
// the original src1 data pointer, so we should index using the indices directly
|
||||
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
||||
const char * src1_col = (const char*)wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
|
||||
: (i11 * nb11 + i12 * nb12 + i13 * nb13));
|
||||
float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
|
||||
|
||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
|
||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||
//}
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
|
||||
}
|
||||
|
||||
for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
|
||||
memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_mul_mat(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
struct ggml_tensor * dst,
|
||||
struct ggml_compute_state * state) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
@@ -11784,9 +11953,6 @@ static void ggml_compute_forward_mul_mat(
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
||||
@@ -11807,8 +11973,10 @@ static void ggml_compute_forward_mul_mat(
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
// broadcast factors
|
||||
const int64_t r2 = ne12/ne02;
|
||||
const int64_t r3 = ne13/ne03;
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
UNUSED(r2);
|
||||
UNUSED(r3);
|
||||
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
@@ -11890,6 +12058,8 @@ static void ggml_compute_forward_mul_mat(
|
||||
#endif
|
||||
|
||||
#if GGML_USE_LLAMAFILE
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
if (src1_cont) {
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
@@ -11915,6 +12085,8 @@ UseGgmlGemm1:;
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
||||
atomic_store(&state->shared->current_chunk, nth);
|
||||
if (src1->type != vec_dot_type) {
|
||||
char * wdata = params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
@@ -11939,11 +12111,11 @@ UseGgmlGemm1:;
|
||||
return;
|
||||
}
|
||||
|
||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
#if GGML_USE_LLAMAFILE
|
||||
if (src1->type != vec_dot_type) {
|
||||
const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
@@ -11964,98 +12136,87 @@ UseGgmlGemm1:;
|
||||
UseGgmlGemm2:;
|
||||
#endif
|
||||
|
||||
const int64_t nr0 = ne01; // src0 rows
|
||||
const int64_t nr1 = ne1*ne12*ne13; // src1 rows
|
||||
#ifdef GGML_PERF
|
||||
int chunks_executed = 0;
|
||||
UNUSED(chunks_executed);
|
||||
#endif
|
||||
|
||||
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
|
||||
// This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
|
||||
const int64_t nr0 = ne0;
|
||||
|
||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||
|
||||
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
||||
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
||||
|
||||
const int64_t ith0 = ith % nth0;
|
||||
const int64_t ith1 = ith / nth0;
|
||||
|
||||
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
|
||||
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
|
||||
|
||||
const int64_t ir010 = dr0*ith0;
|
||||
const int64_t ir011 = MIN(ir010 + dr0, nr0);
|
||||
|
||||
const int64_t ir110 = dr1*ith1;
|
||||
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
||||
|
||||
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
|
||||
|
||||
// threads with no work simply yield (not sure if it helps)
|
||||
if (ir010 >= ir011 || ir110 >= ir111) {
|
||||
sched_yield();
|
||||
return;
|
||||
}
|
||||
|
||||
assert(ne12 % ne02 == 0);
|
||||
assert(ne13 % ne03 == 0);
|
||||
|
||||
// block-tiling attempt
|
||||
const int64_t blck_0 = 16;
|
||||
const int64_t blck_1 = 16;
|
||||
// This is the size of the rest of the dimensions of the result
|
||||
const int64_t nr1 = ne1 * ne2 * ne3;
|
||||
|
||||
// dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
|
||||
int64_t nrc = vec_dot_num_rows;
|
||||
int64_t num_rows_per_vec_dot = vec_dot_num_rows;
|
||||
// TODO: currently the mmla kernels support only even numbered rows/cols.
|
||||
// this check can be removed once they are extended to support odd numbered rows/cols too
|
||||
if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
|
||||
nrc = 1;
|
||||
num_rows_per_vec_dot = 1;
|
||||
}
|
||||
|
||||
const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
|
||||
// Now select a reasonable chunk size.
|
||||
int chunk_size = 16;
|
||||
|
||||
// attempt to reduce false-sharing (does not seem to make a difference)
|
||||
// 16 * 2, accounting for mmla kernels
|
||||
float tmp[32];
|
||||
// We need to step up the size if it's small
|
||||
if (nr0 == 1 || nr1 == 1) {
|
||||
chunk_size = 64;
|
||||
}
|
||||
|
||||
for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
|
||||
for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
|
||||
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
|
||||
const int64_t i13 = (ir1/(ne12*ne1));
|
||||
const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
|
||||
const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
|
||||
// distribute the work across the inner or outer loop based on which one is larger
|
||||
// The number of chunks in the 0/1 dim.
|
||||
// CEIL(nr0/chunk_size)
|
||||
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
|
||||
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
||||
|
||||
// broadcast src0 into src1
|
||||
const int64_t i03 = i13/r3;
|
||||
const int64_t i02 = i12/r2;
|
||||
// If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
||||
// Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
|
||||
// In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
|
||||
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
|
||||
// distribute the thread work across the inner or outer loop based on which one is larger
|
||||
nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
||||
nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
||||
}
|
||||
|
||||
const int64_t i1 = i11;
|
||||
const int64_t i2 = i12;
|
||||
const int64_t i3 = i13;
|
||||
// The number of elements in each chunk
|
||||
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
||||
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
||||
|
||||
const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
|
||||
//if (ith == 0)
|
||||
// printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);
|
||||
|
||||
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
||||
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
||||
// the original src1 data pointer, so we should index using the indices directly
|
||||
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
||||
const char * src1_col = (const char *) wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||
// The first chunk comes from our thread_id, the rest will get auto-assigned.
|
||||
int current_chunk = ith;
|
||||
|
||||
//for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
|
||||
// vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
|
||||
//}
|
||||
while (current_chunk < nchunk0 * nchunk1) {
|
||||
const int64_t ith0 = current_chunk % nchunk0;
|
||||
const int64_t ith1 = current_chunk / nchunk0;
|
||||
|
||||
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
|
||||
vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
|
||||
}
|
||||
const int64_t ir0_start = dr0 * ith0;
|
||||
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
|
||||
|
||||
for (int cn = 0; cn < nrc; ++cn) {
|
||||
memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
|
||||
}
|
||||
}
|
||||
const int64_t ir1_start = dr1 * ith1;
|
||||
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
|
||||
|
||||
ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
||||
|
||||
#ifdef GGML_PERF
|
||||
chunks_executed++;
|
||||
#endif
|
||||
|
||||
if (nth >= nchunk0 * nchunk1) {
|
||||
break;
|
||||
}
|
||||
|
||||
current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
|
||||
}
|
||||
|
||||
#ifdef GGML_PERF
|
||||
// These numbers are useful when trying to measure how well the threading scheduling works.
|
||||
//int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
|
||||
//float time = (ggml_perf_time_us() - t0);
|
||||
//printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
|
||||
#endif
|
||||
}
|
||||
|
||||
// ggml_compute_forward_mul_mat_id
|
||||
@@ -14808,25 +14969,28 @@ static void ggml_compute_forward_upscale_f32(
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_ASSERT(src0->nb[0] == sizeof(float));
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
const float sf0 = (float)ne0/src0->ne[0];
|
||||
const float sf1 = (float)ne1/src0->ne[1];
|
||||
const float sf2 = (float)ne2/src0->ne[2];
|
||||
const float sf3 = (float)ne3/src0->ne[3];
|
||||
|
||||
// TODO: optimize
|
||||
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i03 = i3 / sf3;
|
||||
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i02 = i2 / sf2;
|
||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
||||
const int64_t i01 = i1 / scale_factor;
|
||||
const int64_t i01 = i1 / sf1;
|
||||
for (int64_t i0 = 0; i0 < ne0; i0++) {
|
||||
const int64_t i00 = i0 / scale_factor;
|
||||
const int64_t i00 = i0 / sf0;
|
||||
|
||||
const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
@@ -14856,6 +15020,7 @@ static void ggml_compute_forward_upscale(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_pad
|
||||
|
||||
static void ggml_compute_forward_pad_f32(
|
||||
@@ -17306,7 +17471,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
|
||||
GGML_ASSERT(params);
|
||||
|
||||
if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
|
||||
@@ -17404,7 +17569,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
ggml_compute_forward_mul_mat(params, tensor);
|
||||
ggml_compute_forward_mul_mat(params, tensor, state);
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
{
|
||||
@@ -19020,8 +19185,6 @@ typedef int ggml_lock_t;
|
||||
|
||||
#define GGML_LOCK_INITIALIZER 0
|
||||
|
||||
typedef pthread_t ggml_thread_t;
|
||||
|
||||
#define ggml_thread_create pthread_create
|
||||
#define ggml_thread_join pthread_join
|
||||
|
||||
@@ -19047,8 +19210,6 @@ typedef int ggml_lock_t;
|
||||
|
||||
#define GGML_LOCK_INITIALIZER 0
|
||||
|
||||
typedef pthread_t ggml_thread_t;
|
||||
|
||||
#define ggml_thread_create pthread_create
|
||||
#define ggml_thread_join pthread_join
|
||||
|
||||
@@ -19128,31 +19289,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
|
||||
static void clear_numa_thread_affinity(void) {}
|
||||
#endif
|
||||
|
||||
struct ggml_compute_state_shared {
|
||||
const struct ggml_cgraph * cgraph;
|
||||
const struct ggml_cplan * cplan;
|
||||
|
||||
int64_t perf_node_start_cycles;
|
||||
int64_t perf_node_start_time_us;
|
||||
|
||||
const int n_threads;
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_active; // num active threads
|
||||
atomic_int node_n; // active graph node
|
||||
atomic_int node_task; // active graph node task phase
|
||||
|
||||
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
struct ggml_compute_state {
|
||||
ggml_thread_t thrd;
|
||||
int ith;
|
||||
struct ggml_compute_state_shared * shared;
|
||||
enum ggml_status ec;
|
||||
};
|
||||
|
||||
static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
|
||||
int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles;
|
||||
int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
|
||||
@@ -19425,6 +19561,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
|
||||
|
||||
* node_n = atomic_load(&state->shared->node_n);
|
||||
if (* node_n != last_node_n) break;
|
||||
#if defined(__SSE3__)
|
||||
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19439,6 +19579,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
|
||||
|
||||
* task_phase = atomic_load(&state->shared->node_task);
|
||||
if (* task_phase != last_task_phase) break;
|
||||
#if defined(__SSE3__)
|
||||
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19478,7 +19622,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
}
|
||||
@@ -19498,17 +19642,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
/* INIT */
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
params.type = GGML_TASK_TYPE_INIT;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
|
||||
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
||||
// they do something more efficient than spinning (?)
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
|
||||
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
||||
params.type = GGML_TASK_TYPE_FINALIZE;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
|
||||
ggml_graph_compute_perf_stats_node(node, state->shared);
|
||||
@@ -19547,7 +19691,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
if (state->ith < n_tasks) {
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19568,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
|
||||
if (state->ith < n_tasks) {
|
||||
params.type = GGML_TASK_TYPE_COMPUTE;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
ggml_compute_forward(¶ms, node, state);
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
@@ -19819,6 +19963,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
|
||||
/*.abort_callback =*/ NULL,
|
||||
/*.abort_callback_data =*/ NULL,
|
||||
/*.current_chunk; =*/ 0,
|
||||
};
|
||||
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
||||
|
||||
|
||||
18
ggml.h
18
ggml.h
@@ -565,7 +565,8 @@ extern "C" {
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_backend_type backend;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
@@ -766,7 +767,8 @@ extern "C" {
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
||||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
@@ -1673,12 +1675,24 @@ extern "C" {
|
||||
float p1);
|
||||
|
||||
// nearest interpolate
|
||||
// multiplies ne0 and ne1 by scale factor
|
||||
// used in stable-diffusion
|
||||
GGML_API struct ggml_tensor * ggml_upscale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int scale_factor);
|
||||
|
||||
// nearest interpolate
|
||||
// nearest interpolate to specified dimensions
|
||||
// used in tortoise.cpp
|
||||
GGML_API struct ggml_tensor * ggml_upscale_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3);
|
||||
|
||||
// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
|
||||
GGML_API struct ggml_tensor * ggml_pad(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
@@ -2,5 +2,6 @@ from .constants import *
|
||||
from .lazy import *
|
||||
from .gguf_reader import *
|
||||
from .gguf_writer import *
|
||||
from .quants import *
|
||||
from .tensor_mapping import *
|
||||
from .vocab import *
|
||||
|
||||
@@ -13,6 +13,7 @@ from string import ascii_letters, digits
|
||||
import numpy as np
|
||||
|
||||
from .constants import (
|
||||
GGML_QUANT_SIZES,
|
||||
GGUF_DEFAULT_ALIGNMENT,
|
||||
GGUF_MAGIC,
|
||||
GGUF_VERSION,
|
||||
@@ -195,7 +196,7 @@ class GGUFWriter:
|
||||
return ((x + n - 1) // n) * n
|
||||
|
||||
def add_tensor_info(
|
||||
self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
|
||||
self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
|
||||
tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
|
||||
) -> None:
|
||||
if self.state is not WriterState.EMPTY:
|
||||
@@ -208,10 +209,6 @@ class GGUFWriter:
|
||||
encoded_name = name.encode("utf-8")
|
||||
self.ti_data += self._pack("Q", len(encoded_name))
|
||||
self.ti_data += encoded_name
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += self._pack("I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
|
||||
if raw_dtype is None:
|
||||
if tensor_dtype == np.float16:
|
||||
dtype = GGMLQuantizationType.F16
|
||||
@@ -231,6 +228,15 @@ class GGUFWriter:
|
||||
raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
if tensor_dtype == np.uint8:
|
||||
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
|
||||
if tensor_shape[-1] % type_size != 0:
|
||||
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
|
||||
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += self._pack("I", n_dims)
|
||||
for i in range(n_dims):
|
||||
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
|
||||
self.ti_data += self._pack("I", dtype)
|
||||
self.ti_data += self._pack("Q", self.offset_tensor)
|
||||
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
||||
|
||||
@@ -6,6 +6,7 @@ from typing import Any, Callable
|
||||
from collections import deque
|
||||
|
||||
import numpy as np
|
||||
from numpy._typing import _Shape
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
|
||||
@@ -110,7 +111,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
return o
|
||||
|
||||
@classmethod
|
||||
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike = False) -> Callable[[Any], Any]:
|
||||
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
|
||||
def wrapped_fn(*args, **kwargs):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
@@ -130,9 +131,14 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
res = args[0]
|
||||
assert isinstance(res, cls)
|
||||
res = res._meta
|
||||
# allow operations to override the dtype
|
||||
# allow operations to override the dtype and shape
|
||||
if meta_noop is not True:
|
||||
res = cls.meta_with_dtype(res, meta_noop)
|
||||
if isinstance(meta_noop, tuple):
|
||||
dtype, shape = meta_noop
|
||||
assert callable(shape)
|
||||
res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
|
||||
else:
|
||||
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
|
||||
|
||||
if isinstance(res, cls._tensor_type):
|
||||
def collect_replace(t: LazyBase):
|
||||
@@ -168,7 +174,12 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
while _t._data is None:
|
||||
lt = _t._lazy.popleft()
|
||||
if lt._data is not None:
|
||||
raise ValueError(f"{lt} did not belong in the lazy queue")
|
||||
# Lazy tensor did not belong in the lazy queue.
|
||||
# Weirdly only happens with Bloom models...
|
||||
# likely because tensors aren't unique in the queue.
|
||||
# The final output is still the same as in eager mode,
|
||||
# so it's safe to ignore this.
|
||||
continue
|
||||
assert lt._func is not None
|
||||
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
|
||||
lt._data = lt._func(lt._args)
|
||||
@@ -183,12 +194,12 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||
|
||||
@classmethod
|
||||
def eager_to_meta(cls, t: Any) -> Any:
|
||||
return cls.meta_with_dtype(t, t.dtype)
|
||||
return cls.meta_with_dtype_and_shape(t.dtype, t.shape)
|
||||
|
||||
# must be overridden, meta tensor init is backend-specific
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def meta_with_dtype(cls, m: Any, dtype: Any) -> Any: pass
|
||||
def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass
|
||||
|
||||
@classmethod
|
||||
def from_eager(cls, t: Any) -> Any:
|
||||
@@ -205,15 +216,15 @@ class LazyNumpyTensor(LazyBase):
|
||||
_tensor_type = np.ndarray
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype(cls, m: np.ndarray[Any, Any], dtype: DTypeLike) -> np.ndarray[Any, Any]:
|
||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
|
||||
# The initial idea was to use np.nan as the fill value,
|
||||
# but non-float types like np.int16 can't use that.
|
||||
# So zero it is.
|
||||
cheat = np.zeros(1, dtype)
|
||||
return np.lib.stride_tricks.as_strided(cheat, m.shape, (0 for _ in m.shape))
|
||||
return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))
|
||||
|
||||
def astype(self, dtype, *args, **kwargs):
|
||||
meta = type(self).meta_with_dtype(self._meta, dtype)
|
||||
meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
|
||||
full_args = (self, dtype,) + args
|
||||
# very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
|
||||
return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
|
||||
|
||||
109
gguf-py/gguf/quants.py
Normal file
109
gguf-py/gguf/quants.py
Normal file
@@ -0,0 +1,109 @@
|
||||
from __future__ import annotations
|
||||
from typing import Callable
|
||||
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
|
||||
from .lazy import LazyNumpyTensor
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
||||
n = n.astype(np.float32, copy=False).view(np.int32)
|
||||
# force nan to quiet
|
||||
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
|
||||
# flush subnormals to zero
|
||||
n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
|
||||
# round to nearest even
|
||||
n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
|
||||
return n.astype(np.int16)
|
||||
|
||||
|
||||
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
||||
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||
rows = arr.reshape((-1, arr.shape[-1]))
|
||||
osize = 1
|
||||
for dim in oshape:
|
||||
osize *= dim
|
||||
out = np.empty(shape=osize, dtype=otype)
|
||||
# compute over groups of 16 rows (arbitrary, but seems good for performance)
|
||||
n_groups = rows.shape[0] // 16
|
||||
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
|
||||
return out.reshape(oshape)
|
||||
|
||||
|
||||
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
|
||||
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
|
||||
|
||||
|
||||
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
|
||||
|
||||
|
||||
def quantize_bf16(n: np.ndarray):
|
||||
if type(n) is LazyNumpyTensor:
|
||||
return __quantize_bf16_lazy(n)
|
||||
else:
|
||||
return __quantize_bf16_array(n)
|
||||
|
||||
|
||||
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
|
||||
|
||||
|
||||
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
|
||||
return n.shape[-1] % __q8_block_size == 0
|
||||
|
||||
|
||||
# round away from zero
|
||||
# ref: https://stackoverflow.com/a/59143326/22827863
|
||||
def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||
a = abs(n)
|
||||
floored = np.floor(a)
|
||||
b = floored + np.floor(2 * (a - floored))
|
||||
return np.sign(n) * b
|
||||
|
||||
|
||||
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
|
||||
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
|
||||
|
||||
|
||||
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
||||
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
||||
shape = n.shape
|
||||
assert shape[-1] % __q8_block_size == 0
|
||||
|
||||
n_blocks = n.size // __q8_block_size
|
||||
|
||||
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
|
||||
|
||||
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
||||
with np.errstate(divide="ignore"):
|
||||
id = np.where(d == 0, 0, 1 / d)
|
||||
qs = np_roundf(blocks * id)
|
||||
|
||||
# (n_blocks, 2)
|
||||
d = d.astype(np.float16).view(np.uint8)
|
||||
# (n_blocks, block_size)
|
||||
qs = qs.astype(np.int8).view(np.uint8)
|
||||
|
||||
assert d.shape[1] + qs.shape[1] == __q8_type_size
|
||||
|
||||
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
|
||||
|
||||
|
||||
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
|
||||
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
|
||||
|
||||
|
||||
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
|
||||
__quantize_q8_0_array,
|
||||
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
|
||||
)
|
||||
|
||||
|
||||
def quantize_q8_0(data: np.ndarray):
|
||||
if type(data) is LazyNumpyTensor:
|
||||
return __quantize_q8_0_lazy(data)
|
||||
else:
|
||||
return __quantize_q8_0_array(data)
|
||||
350
llama.cpp
350
llama.cpp
@@ -7,6 +7,10 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
# include "ggml-rpc.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
@@ -1685,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
||||
GGML_UNUSED(host_buffer);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
buft = ggml_backend_vk_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
return buft;
|
||||
|
||||
GGML_UNUSED(gpu);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
if (ggml_backend_cuda_get_device_count() > 1) {
|
||||
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
if (ggml_backend_sycl_get_device_count() > 1) {
|
||||
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_offload(fallback_gpu);
|
||||
}
|
||||
return buft;
|
||||
|
||||
GGML_UNUSED(tensor_split);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_count() {
|
||||
#if defined(GGML_USE_CUDA)
|
||||
return ggml_backend_cuda_get_device_count();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
return ggml_backend_vk_get_device_count();
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
static size_t llama_get_device_memory(int device) {
|
||||
#if defined(GGML_USE_CUDA)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#else
|
||||
return 1;
|
||||
GGML_UNUSED(device);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// globals
|
||||
//
|
||||
@@ -2210,6 +2129,8 @@ struct llama_model {
|
||||
int main_gpu;
|
||||
int n_gpu_layers;
|
||||
|
||||
std::vector<std::string> rpc_servers;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
@@ -2353,6 +2274,104 @@ struct llama_context {
|
||||
#endif
|
||||
};
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
std::string endpoint = model.rpc_servers[gpu];
|
||||
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
||||
#elif defined(GGML_USE_METAL)
|
||||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
buft = ggml_backend_vk_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
return buft;
|
||||
GGML_UNUSED(model);
|
||||
GGML_UNUSED(gpu);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
if (ggml_backend_cuda_get_device_count() > 1) {
|
||||
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
if (ggml_backend_sycl_get_device_count() > 1) {
|
||||
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_offload(model, fallback_gpu);
|
||||
}
|
||||
return buft;
|
||||
|
||||
GGML_UNUSED(tensor_split);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_count(const llama_model & model) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
return model.rpc_servers.size();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
return ggml_backend_cuda_get_device_count();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
return ggml_backend_vk_get_device_count();
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
GGML_UNUSED(model);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
size_t total;
|
||||
size_t free;
|
||||
std::string endpoint = model.rpc_servers[device];
|
||||
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
GGML_UNUSED(model);
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
//
|
||||
// kv cache helpers
|
||||
//
|
||||
@@ -2805,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
|
||||
cache.do_defrag = true;
|
||||
}
|
||||
|
||||
static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
|
||||
// the FA kernels require padding to avoid extra runtime boundary checks
|
||||
return cparams.flash_attn ? 256u : 32u;
|
||||
}
|
||||
|
||||
//
|
||||
// model loading and saving
|
||||
//
|
||||
@@ -4424,7 +4448,9 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "gpt-2" ||
|
||||
tokenizer_pre == "jina-es" ||
|
||||
tokenizer_pre == "jina-de") {
|
||||
tokenizer_pre == "jina-de" ||
|
||||
tokenizer_pre == "jina-v2-es" ||
|
||||
tokenizer_pre == "jina-v2-de") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
@@ -4784,13 +4810,13 @@ static bool llm_load_tensors(
|
||||
|
||||
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
||||
// calculate the split points
|
||||
int device_count = llama_get_device_count();
|
||||
int device_count = llama_get_device_count(model);
|
||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
||||
std::vector<float> splits(device_count);
|
||||
if (all_zero) {
|
||||
// default split, by free memory
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
splits[i] = llama_get_device_memory(i);
|
||||
splits[i] = llama_get_device_memory(model, i);
|
||||
}
|
||||
} else {
|
||||
std::copy(tensor_split, tensor_split + device_count, splits.begin());
|
||||
@@ -4810,35 +4836,35 @@ static bool llm_load_tensors(
|
||||
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
|
||||
}
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
||||
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
||||
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
} else {
|
||||
ggml_backend_buffer_type_t split_buft;
|
||||
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
||||
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
|
||||
} else {
|
||||
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
||||
split_buft = llama_default_buffer_type_offload(main_gpu);
|
||||
split_buft = llama_default_buffer_type_offload(model, main_gpu);
|
||||
}
|
||||
// assign the repeating layers
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
model.buft_layer[i] = {
|
||||
split_buft,
|
||||
llama_default_buffer_type_offload(main_gpu)
|
||||
llama_default_buffer_type_offload(model, main_gpu)
|
||||
};
|
||||
}
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
model.buft_output = {
|
||||
split_buft,
|
||||
llama_default_buffer_type_offload(main_gpu)
|
||||
llama_default_buffer_type_offload(model, main_gpu)
|
||||
};
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
@@ -6334,8 +6360,6 @@ static void llm_build_kv_store(
|
||||
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
|
||||
cb(k_cache_view, "k_cache_view", il);
|
||||
|
||||
// note: storing RoPE-ed version of K in the KV cache
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
||||
|
||||
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
|
||||
|
||||
@@ -6354,7 +6378,19 @@ static void llm_build_kv_store(
|
||||
}
|
||||
cb(v_cache_view, "v_cache_view", il);
|
||||
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
|
||||
struct ggml_tensor * k_cur_cast = ggml_cast(ctx, k_cur, k_cache_view->type);
|
||||
struct ggml_tensor * v_cur_cast = ggml_cast(ctx, v_cur, v_cache_view->type);
|
||||
|
||||
ggml_build_forward_expand(graph, k_cur_cast);
|
||||
ggml_build_forward_expand(graph, v_cur_cast);
|
||||
|
||||
// note: storing RoPE-ed version of K in the KV cache
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur_cast, k_cache_view));
|
||||
//ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
||||
|
||||
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_cast, v_cache_view));
|
||||
//ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
|
||||
}
|
||||
|
||||
static struct ggml_tensor * llm_build_norm(
|
||||
@@ -11508,7 +11544,8 @@ static int llama_decode_internal(
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
// if we start defragmenting the cache, the benefit from this will be more important
|
||||
kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
|
||||
const uint32_t pad = llama_kv_cache_get_padding(cparams);
|
||||
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
|
||||
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
||||
}
|
||||
}
|
||||
@@ -13174,6 +13211,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
||||
return rejects;
|
||||
}
|
||||
|
||||
static bool llama_grammar_detect_left_recursion(
|
||||
const std::vector<std::vector<llama_grammar_element>> & rules,
|
||||
size_t rule_index,
|
||||
std::vector<bool> * rules_visited,
|
||||
std::vector<bool> * rules_in_progress,
|
||||
std::vector<bool> * rules_may_be_empty) {
|
||||
if ((*rules_in_progress)[rule_index]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = true;
|
||||
|
||||
const std::vector<llama_grammar_element> & rule = rules[rule_index];
|
||||
|
||||
// First check if the rule might produce the empty string. This could be done combined with the second
|
||||
// step but it's more readable as two steps.
|
||||
bool at_rule_start = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
if (at_rule_start) {
|
||||
(*rules_may_be_empty)[rule_index] = true;
|
||||
break;
|
||||
}
|
||||
at_rule_start = true;
|
||||
} else {
|
||||
at_rule_start = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
||||
// be empty)
|
||||
bool recurse_into_nonterminal = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
||||
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
||||
return true;
|
||||
}
|
||||
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
recurse_into_nonterminal = true;
|
||||
} else {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = false;
|
||||
(*rules_visited)[rule_index] = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// grammar - external
|
||||
//
|
||||
@@ -13193,6 +13282,19 @@ struct llama_grammar * llama_grammar_init(
|
||||
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
||||
}
|
||||
|
||||
// Check for left recursion
|
||||
std::vector<bool> rules_visited(n_rules);
|
||||
std::vector<bool> rules_in_progress(n_rules);
|
||||
std::vector<bool> rules_may_be_empty(n_rules);
|
||||
for (size_t i = 0; i < n_rules; i++) {
|
||||
if (rules_visited[i]) {
|
||||
continue;
|
||||
}
|
||||
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
||||
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
|
||||
}
|
||||
}
|
||||
|
||||
// loop over alternates of start rule to build initial stacks
|
||||
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
||||
pos = vec_rules[start_rule_index].data();
|
||||
@@ -13215,6 +13317,9 @@ struct llama_grammar * llama_grammar_init(
|
||||
}
|
||||
} while (true);
|
||||
|
||||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
||||
}
|
||||
|
||||
@@ -15314,6 +15419,7 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.rpc_servers =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
@@ -15384,7 +15490,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
}
|
||||
|
||||
size_t llama_max_devices(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
#if defined(GGML_USE_RPC)
|
||||
return GGML_RPC_MAX_SERVERS;
|
||||
#elif defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
return GGML_CUDA_MAX_DEVICES;
|
||||
@@ -15407,7 +15515,7 @@ bool llama_supports_mlock(void) {
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
#else
|
||||
@@ -15470,7 +15578,17 @@ struct llama_model * llama_load_model_from_file(
|
||||
return true;
|
||||
};
|
||||
}
|
||||
|
||||
if (params.rpc_servers != nullptr) {
|
||||
// split the servers set them into model->rpc_servers
|
||||
std::string servers(params.rpc_servers);
|
||||
size_t pos = 0;
|
||||
while ((pos = servers.find(",")) != std::string::npos) {
|
||||
std::string server = servers.substr(0, pos);
|
||||
model->rpc_servers.push_back(server);
|
||||
servers.erase(0, pos + 1);
|
||||
}
|
||||
model->rpc_servers.push_back(servers);
|
||||
}
|
||||
int status = llama_model_load(path_model, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
@@ -15509,6 +15627,11 @@ struct llama_context * llama_new_context_with_model(
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
|
||||
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
||||
params.flash_attn = false;
|
||||
}
|
||||
|
||||
llama_context * ctx = new llama_context(*model);
|
||||
|
||||
const auto & hparams = model->hparams;
|
||||
@@ -15532,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
|
||||
|
||||
// this is necessary due to kv_self.n being padded later during inference
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256);
|
||||
cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
|
||||
|
||||
// with causal attention, the batch size is limited by the context size
|
||||
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
|
||||
@@ -15577,11 +15700,6 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
}
|
||||
|
||||
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
|
||||
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
|
||||
cparams.flash_attn = false;
|
||||
}
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
@@ -15617,7 +15735,17 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
// initialize backends
|
||||
#ifdef GGML_USE_METAL
|
||||
#if defined(GGML_USE_RPC)
|
||||
for (auto & server : model->rpc_servers) {
|
||||
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#elif defined(GGML_USE_METAL)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ctx->backend_metal = ggml_backend_metal_init();
|
||||
if (ctx->backend_metal == nullptr) {
|
||||
@@ -15773,7 +15901,11 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
||||
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
||||
bool pipeline_parallel =
|
||||
llama_get_device_count(*model) > 1 &&
|
||||
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
||||
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
||||
params.offload_kqv;
|
||||
#ifndef GGML_USE_CUDA
|
||||
// pipeline parallelism requires support for async compute and events
|
||||
// currently this is only implemented in the CUDA backend
|
||||
|
||||
3
llama.h
3
llama.h
@@ -242,6 +242,9 @@ extern "C" {
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
// comma separated list of RPC servers to use for offloading
|
||||
const char * rpc_servers;
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
// If the provided progress_callback returns true, model loading continues.
|
||||
// If it returns false, model loading is immediately aborted.
|
||||
|
||||
@@ -112,6 +112,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
# src/ggml-opencl.h -> ggml-opencl.h
|
||||
# src/ggml-quants.c -> ggml-quants.c
|
||||
# src/ggml-quants.h -> ggml-quants.h
|
||||
# src/ggml-rpc.cpp -> ggml-rpc.cpp
|
||||
# src/ggml-rpc.h -> ggml-rpc.h
|
||||
# src/ggml-sycl.cpp -> ggml-sycl.cpp
|
||||
# src/ggml-sycl.h -> ggml-sycl.h
|
||||
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
|
||||
@@ -149,6 +151,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
||||
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
|
||||
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
|
||||
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
|
||||
-e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
|
||||
-e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
|
||||
-e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
|
||||
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
|
||||
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
|
||||
|
||||
@@ -1 +1 @@
|
||||
30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11
|
||||
126d34985705a5a2222723c145cb4e125ac689f3
|
||||
|
||||
@@ -20,6 +20,8 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
|
||||
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
|
||||
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
|
||||
cp -rpv ../ggml/src/ggml-rpc.h ./ggml-rpc.h
|
||||
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp
|
||||
cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h
|
||||
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp
|
||||
|
||||
@@ -1329,23 +1329,47 @@ struct test_upscale : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const int32_t scale_factor;
|
||||
const bool transpose;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, scale_factor);
|
||||
return VARS_TO_STR4(type, ne, scale_factor, transpose);
|
||||
}
|
||||
|
||||
test_upscale(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
||||
int32_t scale_factor = 2)
|
||||
: type(type), ne(ne), scale_factor(scale_factor) {}
|
||||
int32_t scale_factor = 2, bool transpose = false)
|
||||
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
if (transpose) a = ggml_transpose(ctx, a);
|
||||
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_UPSCALE (ext)
|
||||
struct test_upscale_ext : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const std::array<int64_t, 4> ne_tgt;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, ne_tgt);
|
||||
}
|
||||
|
||||
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {2, 5, 7, 11},
|
||||
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
|
||||
: type(type), ne(ne), ne_tgt(ne_tgt) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_GROUP_NORM
|
||||
struct test_group_norm : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -1487,25 +1511,27 @@ struct test_flash_attn_ext : public test_case {
|
||||
const int64_t kv; // kv size
|
||||
const int64_t nb; // batch size
|
||||
|
||||
const bool mask; // use mask
|
||||
|
||||
const float max_bias; // ALiBi
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
|
||||
return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
|
||||
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
@@ -2167,6 +2193,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
|
||||
test_cases.emplace_back(new test_sum_rows());
|
||||
test_cases.emplace_back(new test_upscale());
|
||||
test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
|
||||
test_cases.emplace_back(new test_upscale_ext());
|
||||
test_cases.emplace_back(new test_group_norm());
|
||||
test_cases.emplace_back(new test_acc());
|
||||
test_cases.emplace_back(new test_pad());
|
||||
@@ -2175,11 +2203,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_leaky_relu());
|
||||
|
||||
for (int hs : { 64, 80, 128, 256, }) {
|
||||
for (float max_bias : {0.0f, 8.0f}) {
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, }) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
|
||||
for (bool mask : { true, false } ) {
|
||||
for (float max_bias : { 0.0f, 8.0f }) {
|
||||
if (!mask && max_bias > 0.0f) continue;
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, }) {
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,6 +28,19 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
|
||||
return grammar;
|
||||
}
|
||||
|
||||
static bool test_build_grammar_fails(const std::string & grammar_str) {
|
||||
fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
|
||||
bool grammar_fails = false;
|
||||
try {
|
||||
build_grammar(grammar_str);
|
||||
fprintf(stderr, " ❌ Expected build failure, but succeeded\n");
|
||||
} catch (const std::exception & err) {
|
||||
grammar_fails = true;
|
||||
fprintf(stdout, " ✅︎\n");
|
||||
}
|
||||
return grammar_fails;
|
||||
}
|
||||
|
||||
static bool match_string(const std::string & input, llama_grammar* grammar) {
|
||||
auto decoded = decode_utf8(input, {});
|
||||
|
||||
@@ -320,6 +333,38 @@ number ::= [0-9]+)""";
|
||||
fprintf(stderr, " ✅︎ Passed\n");
|
||||
}
|
||||
|
||||
static void test_failure_left_recursion() {
|
||||
fprintf(stderr, "⚫ Testing left recursion detection:\n");
|
||||
|
||||
// Test simple left recursion detection
|
||||
const std::string simple_str = R"""(root ::= "a" | root "a")""";
|
||||
assert(test_build_grammar_fails(simple_str));
|
||||
|
||||
// Test more complicated left recursion detection
|
||||
const std::string medium_str = R"""(
|
||||
root ::= asdf
|
||||
asdf ::= "a" | asdf "a"
|
||||
)""";
|
||||
assert(test_build_grammar_fails(medium_str));
|
||||
|
||||
// Test even more complicated left recursion detection
|
||||
const std::string hard_str = R"""(
|
||||
root ::= asdf
|
||||
asdf ::= "a" | foo "b"
|
||||
foo ::= "c" | asdf "d" | "e")""";
|
||||
assert(test_build_grammar_fails(hard_str));
|
||||
|
||||
// Test yet even more complicated left recursion detection
|
||||
const std::string hardest_str = R"""(
|
||||
root ::= asdf
|
||||
asdf ::= "a" | foo "b"
|
||||
foo ::= "c" | empty asdf "d" | "e"
|
||||
empty ::= "blah" | )""";
|
||||
assert(test_build_grammar_fails(hardest_str));
|
||||
|
||||
fprintf(stderr, " ✅︎ Passed\n");
|
||||
}
|
||||
|
||||
int main() {
|
||||
fprintf(stdout, "Running grammar integration tests...\n");
|
||||
test_simple_grammar();
|
||||
@@ -327,6 +372,7 @@ int main() {
|
||||
test_quantifiers();
|
||||
test_failure_missing_root();
|
||||
test_failure_missing_reference();
|
||||
test_failure_left_recursion();
|
||||
fprintf(stdout, "All tests passed.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user