sched : support async weight copy

readme : remove stray double quote (#7310)
Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2026-04-23 16:37:33 +03:00 · 2024-05-16 00:47:40 +02:00 · 2024-05-15 23:41:03 +02:00 · 2024-05-15 19:59:12 +02:00 · 2024-05-15 15:44:49 +02:00 · 2024-05-15 15:08:48 +02:00
47 changed files with 4823 additions and 559 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -340,6 +340,36 @@ jobs:
          cd build
          ctest -L main --verbose

+  ubuntu-latest-cmake-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_RPC=ON ..
+          cmake --build . --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
  ubuntu-22-cmake-vulkan:
    runs-on: ubuntu-22.04

@@ -663,6 +693,8 @@ jobs:
    strategy:
      matrix:
        include:
+          - build: 'rpc'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'noavx'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2'
@@ -898,9 +930,9 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
      - name: Clone
        id: checkout
@@ -932,6 +964,17 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload artifacts
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
+option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
    endif()
 endif()

+if (LLAMA_RPC)
+    # RPC backend: define GGML_USE_RPC and hand ggml-rpc.{h,cpp} to the ggml target.
+    add_compile_definitions(GGML_USE_RPC)
+    if (WIN32)
+        list(APPEND LLAMA_EXTRA_LIBS ws2_32)
+    endif()
+
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+endif()
+
 if (LLAMA_CLBLAST)
    find_package(CLBlast)
    if (CLBlast_FOUND)
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
--- a/README.md
+++ b/README.md
@@ -532,7 +532,7 @@ Building the program with BLAS support may lead to some performance improvements
        cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
        && cmake --build build --config Release -- -j 16
    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

  - Using `make` (example for target gfx1030, build with 16 CPU threads):
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1060,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
        return true;
    }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
    if (arg == "--no-mmap") {
        params.use_mmap = false;
        return true;
@@ -1557,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
        printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
        printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
    }
+    printf("  --rpc SERVERS         comma separated list of RPC servers\n");
    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
    printf("  -gan N, --grp-attn-n N\n");
@@ -1830,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
+    mparams.rpc_servers     = params.rpc_servers.c_str();
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
--- a/common/common.h
+++ b/common/common.h
@@ -82,6 +82,7 @@ struct gpt_params {
    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers       = "";    // comma separated list of RPC servers

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -74,9 +74,9 @@ models = [
    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-en",        "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-es",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-de",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
+    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -240,23 +240,6 @@ class Model:
        return False

    def write_tensors(self):
-        # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
-        def np_fp32_to_bf16(n: np.ndarray):
-            # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-            # flush subnormals to zero
-            n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
-            # round to nearest even
-            n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n.astype(np.int16)
-
-        # Doing this row-wise is much, much faster than element-wise, hence the signature
-        v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
-        if self.lazy:
-            # TODO: find a way to implicitly wrap np.vectorize functions
-            # NOTE: the type is changed to reflect otypes passed to np.vectorize above
-            v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
-
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

        for name, data_torch in self.get_tensors():
@@ -309,27 +292,31 @@ class Model:
                ))

                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data = gguf.quantize_bf16(data)
+                        assert data.dtype == np.int16
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                        data = gguf.quantize_q8_0(data)
+                        assert data.dtype == np.uint8
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+                    else:  # default to float16 for quantized tensors
                        if data_dtype != np.float16:
                            data = data.astype(np.float16)
                        data_qtype = gguf.GGMLQuantizationType.F16

-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        if data_dtype != np.float32:
-                            data = data.astype(np.float32)
-                        data = v_fp32_to_bf16(data.view(np.int32))
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                else:  # by default, convert to float32
+                if data_qtype is None:  # by default, convert to float32
                    if data_dtype != np.float32:
                        data = data.astype(np.float32)
                    data_qtype = gguf.GGMLQuantizationType.F32

-                assert data_qtype is not None
-
+                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
                # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+                )}}}"""

                # n_dims is implicit in the shape
                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@@ -475,13 +462,13 @@ class Model:
            res = "dbrx"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
-            res = "jina-en"
+            res = "jina-v2-en"
        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
-            res = "jina-es"
+            res = "jina-v2-es"
        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
-            res = "jina-de"
+            res = "jina-v2-de"

        if res is None:
            logger.warning("\n")
@@ -859,6 +846,7 @@ class BaichuanModel(Model):
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
@@ -981,6 +969,7 @@ class XverseModel(Model):
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
            if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1215,6 +1204,7 @@ class StableLMModel(Model):
        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)

    _q_norms: list[dict[str, Tensor]] | None = None
    _k_norms: list[dict[str, Tensor]] | None = None
@@ -1591,6 +1581,7 @@ class QwenModel(Model):
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)


@Model.register("Qwen2ForCausalLM")
@@ -1828,6 +1819,7 @@ class PlamoModel(Model):
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)

    def shuffle_attn_q_weight(self, data_torch):
        assert data_torch.size() == (5120, 5120)
@@ -2007,6 +1999,7 @@ in chat mode so that the conversation can end normally.")
        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_heads = self.hparams["num_attention_heads"]
@@ -2415,25 +2408,15 @@ class LazyTorchTensor(gguf.LazyBase):
    def numpy(self) -> gguf.LazyNumpyTensor:
        dtype = self._dtype_map[self.dtype]
        return gguf.LazyNumpyTensor(
-            meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
            lazy=self._lazy,
            args=(self,),
            func=(lambda s: s[0].numpy())
        )

    @classmethod
-    def eager_to_meta(cls, t: Tensor) -> Tensor:
-        if t.is_meta:
-            return t
-        return t.detach().to("meta")
-
-    @classmethod
-    def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
-        m = m.detach()
-        if not m.is_meta:
-            m = m.to("meta")
-        m.dtype = dtype
-        return m
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
@@ -2464,8 +2447,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -2523,6 +2506,7 @@ def main() -> None:
        "f32": gguf.LlamaFileType.ALL_F32,
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

--- a/convert.py
+++ b/convert.py
@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -344,10 +344,47 @@ class Params:
        return params


+@dataclass
+class Metadata:
+    # Descriptive model metadata; a field left as None is not written by add_meta_model.
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None  # field is spelled "licence"; the JSON key is "general.license"
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        # No path given, or no such file: return a Metadata with every field None.
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        metadata = Metadata()
+        # Copy each known key out of the JSON object; keys that are absent stay None.
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.licence = data.get("general.license")  # must target the declared `licence` field
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
+
+
 #
 # vocab
 #

+
@runtime_checkable
 class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]
@@ -1066,21 +1103,42 @@ class OutputFile:
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-    def add_meta_arch(self, params: Params) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+        # Metadata About The Model And Its Provenance
        name = "LLaMA"
-
-        # TODO: better logic to determine model name
-        if params.n_ctx == 4096:
-            name = "LLaMA v2"
+        if metadata is not None and metadata.name is not None:
+            name = metadata.name
        elif params.path_model is not None:
-            name = str(params.path_model.parent).split('/')[-1]
+            name = str(params.path_model.parent).split("/")[-1]
+        elif params.n_ctx == 4096:
+            # Heuristic detection of LLaMA v2 model
+            name = "LLaMA v2"

-        self.gguf.add_name                (name)
-        self.gguf.add_vocab_size          (params.n_vocab)
-        self.gguf.add_context_length      (params.n_ctx)
-        self.gguf.add_embedding_length    (params.n_embd)
-        self.gguf.add_block_count         (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
+        self.gguf.add_name(name)
+
+        if metadata is not None:
+            if metadata.author is not None:
+                self.gguf.add_author(metadata.author)
+            if metadata.version is not None:
+                self.gguf.add_version(metadata.version)
+            if metadata.url is not None:
+                self.gguf.add_url(metadata.url)
+            if metadata.description is not None:
+                self.gguf.add_description(metadata.description)
+            if metadata.licence is not None:
+                self.gguf.add_licence(metadata.licence)
+            if metadata.source_url is not None:
+                self.gguf.add_source_url(metadata.source_url)
+            if metadata.source_hf_repo is not None:
+                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
+
+    def add_meta_arch(self, params: Params) -> None:
+        # Metadata About The Neural Architecture Itself
+        self.gguf.add_vocab_size(params.n_vocab)
+        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_feed_forward_length(params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
@@ -1183,13 +1241,14 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
+        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
        of.add_meta_special_vocab(svocab)
@@ -1216,12 +1275,14 @@ class OutputFile:
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
+        metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
+        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        if isinstance(vocab, Vocab):
            of.add_meta_vocab(vocab)
@@ -1257,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    raise ValueError(f"Unexpected combination of types: {name_to_type}")


+def model_parameter_count(model: LazyModel) -> int:
+    total_model_parameters = 0  # running sum of element counts over every tensor in the model
+    for lazy_tensor in model.values():
+        sum_weights_in_tensor = 1  # element count = product of the tensor's shape dimensions
+        for dim in lazy_tensor.shape:
+            sum_weights_in_tensor *= dim
+        total_model_parameters += sum_weights_in_tensor
+    return total_model_parameters
+
+
+def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+    """Format a parameter count as a rounded number followed by a T/B/M/K suffix.
+
+    The largest unit that the count strictly exceeds is used; a count of one
+    million or less is expressed in thousands ("K").
+    """
+    # (exclusive lower bound, multiplier applied to the count, suffix)
+    units = (
+        (1e12, 1e-12, "T"),  # Trillions Of Parameters
+        (1e9, 1e-9, "B"),    # Billions Of Parameters
+        (1e6, 1e-6, "M"),    # Millions Of Parameters
+    )
+    factor, scale_suffix = 1e-3, "K"  # fallback: Thousands Of Parameters
+    for lower_bound, unit_factor, unit_suffix in units:
+        if model_params_count > lower_bound:
+            factor, scale_suffix = unit_factor, unit_suffix
+            break
+
+    return f"{round(model_params_count * factor)}{scale_suffix}"
+
+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
            for (name, tensor) in model.items()}
@@ -1436,13 +1528,35 @@ class VocabFactory:
        return vocab, special_vocab


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
-    namestr = {
-        GGMLFileType.AllF32:    "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0:"q8_0",
+def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
+    quantization = {
+        GGMLFileType.AllF32:    "F32",
+        GGMLFileType.MostlyF16: "F16",
+        GGMLFileType.MostlyQ8_0: "Q8_0",
    }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+
+    parameters = model_parameter_count_rounded_notation(model_params_count)
+
+    expert_count = ""
+    if params.n_experts is not None:
+        expert_count = f"{params.n_experts}x"
+
+    version = ""
+    if metadata is not None and metadata.version is not None:
+        version = f"-{metadata.version}"
+
+    name = "ggml-model"
+    if metadata is not None and metadata.name is not None:
+        name = metadata.name
+    elif params.path_model is not None:
+        name = params.path_model.name
+
+    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+    ret = model_paths[0].parent / f"{default_filename}.gguf"
    if ret in model_paths:
        logger.error(
            f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1480,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
+    parser.add_argument("--metadata",     type=Path,              help="Specify the path for a metadata file")
+    parser.add_argument("--get-outfile",  action="store_true",    help="get calculated default outfile name")

    args = parser.parse_args(args_in)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
-    elif args.dump_single or args.dump:
+    elif args.dump_single or args.dump or args.get_outfile:
        # Avoid printing anything besides the dump output
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

+    metadata = Metadata.load(args.metadata)
+
+    if args.get_outfile:
+        model_plus = load_some_model(args.model)
+        params = Params.load(model_plus)
+        model   = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = model_parameter_count(model_plus.model)
+        ftype   = pick_output_type(model, args.outtype)
+        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
+        return
+
    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

@@ -1504,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
    else:
        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

+    model_params_count = model_parameter_count(model_plus.model)
+    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
+
    if args.dump:
        do_dump_model(model_plus)
        return
@@ -1557,7 +1687,7 @@ def main(args_in: list[str] | None = None) -> None:
                f_norm_eps = 1e-5,
            )
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
        logger.info(f"Wrote {outfile}")
        return

@@ -1570,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)

    params.ftype = ftype
    logger.info(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
    logger.info(f"Wrote {outfile}")


--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -49,4 +49,7 @@ else()
        add_subdirectory(server)
    endif()
    add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
 endif()
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {

    // clean up
    llama_print_timings(ctx);
+    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1358,6 +1358,7 @@ int main(int argc, char ** argv) {
        }

        p->print_test(t);
+        fflush(p->fout);

        llama_print_timings(ctx);

--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -300,14 +300,10 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    for (auto & image : params.image) {
+    if (prompt_contains_image(params.prompt)) {
        auto ctx_llava = llava_init_context(&params, model);

-        auto image_embed = load_image(ctx_llava, &params, image);
-        if (!image_embed) {
-            std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
-            return 1;
-        }
+        auto image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -316,7 +312,26 @@ int main(int argc, char ** argv) {
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
+    } else {
+        for (auto & image : params.image) {
+            auto ctx_llava = llava_init_context(&params, model);
+
+            auto image_embed = load_image(ctx_llava, &params, image);
+            if (!image_embed) {
+                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+                return 1;
+            }
+
+            // process the prompt
+            process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+            llama_print_timings(ctx_llava->ctx_llama);
+            llava_image_embed_free(image_embed);
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+        }
    }
+
    llama_free_model(model);

    return 0;
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
    struct {
-        struct ggml_tensor * newline;
        struct ggml_context * ctx;
    } model;

@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

    model.ctx = ggml_init(params);

-    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
-    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
-        if (newline_tmp->buffer == NULL) {
-            LOG_TEE("newline_tmp tensor buffer is NULL\n");
-        }
-        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
-    } else {
-        model.newline->data = newline_tmp->data;
-        if (model.newline->data == NULL) {
-            LOG_TEE("newline_tmp tensor data is NULL\n");
-        }
-    }
-
    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -7,6 +7,8 @@ Also note that finetunes typically result in a higher perplexity value even thou

 Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
 The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
+When numbers are listed, all command line arguments and compilation options are left at their defaults unless noted otherwise.
+llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.

 By default only the mean perplexity value and the corresponding uncertainty is calculated.
 The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
@@ -32,7 +34,13 @@ In addition to the KL divergence the following statistics are calculated with `-

 ## LLaMA 3 8b Scoreboard

-Results are sorted by Kullback-Leibler divergence relative to FP16.
+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
+Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
 The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).

 | Quantization | imatrix | Model size [GiB] | PPL                    | ΔPPL                   | KLD                   | Mean Δp           | RMS Δp           |
@@ -89,6 +97,12 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence

 ## LLaMA 2 vs. LLaMA 3 Quantization comparison

+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
 | Metric          |          L2 7b q2_K |          L3 8b q2_K |        L2 7b q4_K_M |        L3 8b q4_K_M |          L2 7b q6_K |          L3 8b q6_K |          L2 7b q8_0 |          L3 8b q8_0 |
 |-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
 | Mean PPL        | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
@@ -107,6 +121,50 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
 | RMS Δp          |     9.762 ± 0.053 % |    21.421 ± 0.079 % |     3.252 ± 0.024 % |     5.519 ± 0.050 % |     1.339 ± 0.010 % |     2.295 ± 0.019 % |     0.618 ± 0.011 % |     1.198 ± 0.007 % |
 | Same top p      |    85.584 ± 0.086 % |    71.138 ± 0.119 % |    94.665 ± 0.055 % |    91.901 ± 0.072 % |    97.520 ± 0.038 % |    96.031 ± 0.051 % |    98.846 ± 0.026 % |    97.674 ± 0.040 % |

+## LLaMA 3 BF16 vs. FP16 comparison
+
+| Revision | 83330d8c      |
+|:---------|:--------------|
+| Backend  | CPU           |
+| CPU      | AMD Epyc 7742 |
+| GPU      | N/A           |
+
+Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
+
+| Metric                         |                    Value |
+|--------------------------------|--------------------------|
+| Mean PPL(Q)                    |      6.227711 ± 0.037833 |
+| Mean PPL(base)                 |      6.225194 ± 0.037771 |
+| Cor(ln(PPL(Q)), ln(PPL(base))) |                  99.990% |
+| Mean ln(PPL(Q)/PPL(base))      |      0.000404 ± 0.000086 |
+| Mean PPL(Q)/PPL(base)          |      1.000404 ± 0.000086 |
+| Mean PPL(Q)-PPL(base)          |      0.002517 ± 0.000536 |
+| Mean    KLD                    |  0.00002515 ± 0.00000020 |
+| Maximum KLD                    |                 0.012206 |
+| 99.9%   KLD                    |                 0.000799 |
+| 99.0%   KLD                    |                 0.000222 |
+| Median  KLD                    |                 0.000013 |
+| 10.0%   KLD                    |                -0.000002 |
+| 5.0%   KLD                     |                -0.000008 |
+| 1.0%   KLD                     |                -0.000023 |
+| Minimum KLD                    |                -0.000059 |
+| Mean    Δp                     | -0.0000745 ± 0.0003952 % |
+| Maximum Δp                     |                   4.186% |
+| 99.9%   Δp                     |                   1.049% |
+| 99.0%   Δp                     |                   0.439% |
+| 95.0%   Δp                     |                   0.207% |
+| 90.0%   Δp                     |                   0.125% |
+| 75.0%   Δp                     |                   0.029% |
+| Median  Δp                     |                   0.000% |
+| 25.0%   Δp                     |                  -0.030% |
+| 10.0%   Δp                     |                  -0.126% |
+| 5.0%   Δp                      |                  -0.207% |
+| 1.0%   Δp                      |                  -0.434% |
+| 0.1%   Δp                      |                  -1.016% |
+| Minimum Δp                     |                  -4.672% |
+| RMS Δp                         |          0.150 ± 0.001 % |
+| Same top p                     |         99.739 ± 0.013 % |

 ## Old Numbers

--- a/examples/rpc/CMakeLists.txt
+++ b/examples/rpc/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(rpc-server rpc-server.cpp)
+target_link_libraries(rpc-server PRIVATE ggml llama)
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -0,0 +1,74 @@
+## Overview
+
+The `rpc-server` allows running a `ggml` backend on a remote host.
+The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
+This can be used for distributed LLM inference with `llama.cpp` in the following way:
+
+```mermaid
+flowchart TD
+    rpcb---|TCP|srva
+    rpcb---|TCP|srvb
+    rpcb-.-|TCP|srvn
+    subgraph hostn[Host N]
+    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hostb[Host B]
+    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph hosta[Host A]
+    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
+    end
+    subgraph host[Main Host]
+    ggml[llama.cpp]---rpcb[RPC backend]
+    end
+    style hostn stroke:#666,stroke-width:2px,stroke-dasharray: 5 5
+```
+
+Each host can run a different backend, e.g. one with CUDA and another with Metal.
+You can also run multiple `rpc-server` instances on the same host, each with a different backend.
+
+## Usage
+
+On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
+For example, to build the CUDA backend with RPC support:
+
+```bash
+mkdir build-rpc-cuda
+cd build-rpc-cuda
+cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
+cmake --build . --config Release
+```
+
+Then, start the `rpc-server` with the backend:
+
+```bash
+$ bin/rpc-server 0.0.0.0 50052
+create_backend: using CUDA backend
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
+ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
+Starting RPC server on 0.0.0.0:50052
+```
+
+When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
+```bash
+$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
+```
+This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
+
+
+On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
+
+```bash
+mkdir build-rpc
+cd build-rpc
+cmake .. -DLLAMA_RPC=ON
+cmake --build . --config Release
+```
+
+Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
+
+```bash
+$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+```
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -0,0 +1,70 @@
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "ggml-rpc.h"
+#include <stdexcept>
+#include <string>
+#include <stdio.h>
+
+// Creates the ggml backend that this server exposes over RPC.
+// The GPU backend selected at compile time is tried first (CUDA device 0, or
+// Metal). If no GPU backend is compiled in, or its initialization fails, the
+// CPU backend is used instead. The result of ggml_backend_cpu_init() is
+// returned as-is in that case, so the caller still has to check for NULL.
+static ggml_backend_t create_backend() {
+    ggml_backend_t backend = NULL;
+#if defined(GGML_USE_CUDA)
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    backend = ggml_backend_cuda_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+// test for definedness, like the #ifdef that guards the ggml-metal.h include,
+// so that a macro defined without a value (or as 0) selects the same branch
+#elif defined(GGML_USE_METAL)
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    backend = ggml_backend_metal_init();
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // no GPU backend available (not compiled in or failed to initialize): fall back to the CPU backend
+    if (!backend) {
+        fprintf(stderr, "%s: using CPU backend\n", __func__);
+        backend = ggml_backend_cpu_init();
+    }
+    return backend;
+}
+
+// Writes the free and total memory of the served device to the two outputs.
+// In a CUDA build the values are queried from device 0; every other build
+// currently stores the placeholder value 1 in both outputs.
+static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
+#ifndef GGML_USE_CUDA
+    // TODO: implement for other backends
+    *free_mem  = 1;
+    *total_mem = 1;
+#else
+    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#endif
+}
+
+// Entry point. Usage: rpc-server <host> <port>
+// Validates the arguments, creates the backend and serves it on host:port.
+// Returns 1 on a usage error, an invalid port or a backend creation failure,
+// and 0 once start_rpc_server() has returned.
+int main(int argc, char * argv[]) {
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
+        return 1;
+    }
+    const char * host = argv[1];
+    const std::string port_arg = argv[2];
+
+    // std::stoi throws std::invalid_argument / std::out_of_range (both derived
+    // from std::logic_error) on bad input and stops at the first character it
+    // cannot parse, so reject arguments that were not consumed completely
+    // instead of terminating with an uncaught exception or accepting "80abc"
+    int port = 0;
+    bool port_ok = false;
+    try {
+        size_t n_parsed = 0;
+        port = std::stoi(port_arg, &n_parsed);
+        port_ok = n_parsed == port_arg.size() && port > 0 && port <= 65535;
+    } catch (const std::logic_error &) {
+        port_ok = false;
+    }
+    if (!port_ok) {
+        fprintf(stderr, "Invalid port number: %s\n", port_arg.c_str());
+        return 1;
+    }
+
+    ggml_backend_t backend = create_backend();
+    if (!backend) {
+        fprintf(stderr, "Failed to create backend\n");
+        return 1;
+    }
+    printf("Starting RPC server on %s:%d\n", host, port);
+    size_t free_mem, total_mem;
+    get_backend_memory(&free_mem, &total_mem);
+    std::string endpoint = std::string(host) + ":" + std::to_string(port);
+    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_free(backend);
+    return 0;
+}
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -48,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `--path`: Path from which to serve static files. Default: disabled
 - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction. Default: disabled
+- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
 - `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching).  Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -293,13 +293,14 @@ def start_server_background(args):


 def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        _is_server_listening = result == 0
-        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...")
-        return _is_server_listening
-
+    try:
+        url = f"{server_fqdn}:{server_port}/health"
+        if not url.startswith("http://"):
+            url = f"http://{url}"
+        result = requests.get(url)
+        return result.status_code == 200
+    except Exception:
+        return False

 def escape_metric_name(metric_name):
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -671,6 +671,13 @@ struct server_context {
            model = nullptr;
        }

+        // Clear any sampling context
+        for (server_slot & slot : slots) {
+            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling);
+            }
+        }
+
        llama_batch_free(batch);
    }

--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -887,6 +887,7 @@ async def oai_chat_completions(user_prompt,
                               base_path,
                               async_client,
                               debug=False,
+                               temperature=None,
                               model=None,
                               n_predict=None,
                               enable_streaming=None,
@@ -913,7 +914,8 @@ async def oai_chat_completions(user_prompt,
        "model": model,
        "max_tokens": n_predict,
        "stream": enable_streaming,
-        "seed": seed
+        "temperature": temperature if temperature is not None else 0.0,
+        "seed": seed,
    }
    if response_format is not None:
        payload['response_format'] = response_format
@@ -978,7 +980,8 @@ async def oai_chat_completions(user_prompt,
                max_tokens=n_predict,
                stream=enable_streaming,
                response_format=payload.get('response_format'),
-                seed=seed
+                seed=seed,
+                temperature=payload['temperature']
            )
        except openai.error.AuthenticationError as e:
            if expect_api_error is not None and expect_api_error:
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
    llama_params["presence_penalty"]  = json_value(body,   "presence_penalty",  0.0);
    llama_params["seed"]              = json_value(body,   "seed",              LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body,   "stream",            false);
-    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
+    llama_params["temperature"]       = json_value(body,   "temperature",       1.0);
    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);

    // Apply chat template to the list of messages
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -114,6 +114,8 @@ extern "C" {
        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
+
+        ggml_backend_t (*GGML_CALL backend_dup)(ggml_backend_t backend);
    };

    struct ggml_backend {
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -180,6 +180,13 @@ void ggml_backend_free(ggml_backend_t backend) {
    backend->iface.free(backend);
 }

+// Returns the result of the backend's optional backend_dup hook.
+// A backend that does not provide the hook gets its own handle back
+// unchanged, i.e. no new backend is created in that case.
+ggml_backend_t ggml_backend_dup(ggml_backend_t backend) {
+    if (backend->iface.backend_dup == NULL) {
+        return backend;
+    }
+    return backend->iface.backend_dup(backend);
+}
+
 ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
    return backend->iface.get_default_buffer_type(backend);
 }
@@ -855,6 +862,7 @@ static struct ggml_backend_i cpu_backend_i = {
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .event_synchronize       = */ NULL,
+    /* .backend_dup             = */ NULL,
 };

 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -1026,16 +1034,34 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_COPIES 4
 #endif

+#ifndef GGML_SCHED_MAX_COPY_STREAMS
+#define GGML_SCHED_MAX_COPY_STREAMS 8
+#endif
+
 struct ggml_backend_sched_split {
    int backend_id;
    int i_start;
    int i_end;
+
+    // input tensors from other backends
    struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
    int n_inputs;
+
+    // copy stream to use to copy the inputs that are weights (-1 = no copy stream)
+    int w_copy_stream_id;
+
    // graph view of this split
    struct ggml_cgraph graph;
 };

+// resources of one copy stream of a backend, used to copy the weight inputs of a split asynchronously
+struct ggml_backend_sched_copy_stream {
+    ggml_backend_t stream;           // backend obtained from ggml_backend_dup() on which the weight copies are issued
+    ggml_backend_buffer_t buffer;    // destination buffer for the copied weights, (re)allocated to hold at least max_size bytes
+    ggml_backend_event_t event_copy; // recorded after the copies of a split are queued; the split backend waits on it before computing
+    ggml_backend_event_t event_use;  // recorded after the split has been computed; the copy stream waits on it
+    size_t max_size;                 // largest allocation size among the weight inputs assigned to this stream
+};
+
 struct ggml_backend_sched {
    bool is_reset; // true if the scheduler has been reset since the last graph split
    bool is_alloc;
@@ -1046,6 +1072,9 @@ struct ggml_backend_sched {
    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
    ggml_gallocr_t galloc;

+    struct ggml_backend_sched_copy_stream copy_streams[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPY_STREAMS];
+    int cur_copy_stream[GGML_SCHED_MAX_BACKENDS];
+
    // hash keys of the nodes in the graph
    struct ggml_hash_set    hash_set;
    // hash values
@@ -1228,6 +1257,14 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
 //#define DEBUG_PASS3
 //#define DEBUG_PASS4

+// Resets a split so that it starts at node i_start on backend backend_id,
+// with no inputs, no end node yet (-1) and no weight copy stream assigned (-1).
+// The inputs array and the graph view of the split are left untouched.
+static void init_split(ggml_backend_sched_t sched, struct ggml_backend_sched_split * split, int backend_id, int i_start) {
+    (void) sched; // not needed yet; kept in the signature so the call sites stay unchanged
+
+    split->backend_id = backend_id;
+    split->i_start = i_start;
+    split->i_end = -1;
+    split->n_inputs = 0;
+    split->w_copy_stream_id = -1;
+}
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    // reset splits
@@ -1406,19 +1443,17 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
    // pass 4: split graph, find tensors that need to be copied
    {
        int i_split = 0;
+        int cur_backend_id = 0;
        struct ggml_backend_sched_split * split = &sched->splits[0];
        // find the backend of the first split, skipping view ops
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];
            if (!ggml_is_view_op(node->op)) {
-                split->backend_id = tensor_backend_id(node);
+                cur_backend_id = tensor_backend_id(node);
                break;
            }
        }
-        split->i_start = 0;
-        split->n_inputs = 0;
-        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
-        int cur_backend_id = split->backend_id;
+        init_split(sched, split, cur_backend_id, 0);
        for (int i = 0; i < graph->n_nodes; i++) {
            struct ggml_tensor * node = graph->nodes[i];

@@ -1433,6 +1468,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                if (split->w_copy_stream_id != -1) {
+                    // the previous op used a weight copy stream, start a new split to allow the next copy to start immediately after the op
+                    need_new_split = true;
+                }
+
                for (int j = 0; j < GGML_MAX_SRC; j++) {
                    struct ggml_tensor * src = node->src[j];
                    if (src == NULL) {
@@ -1452,7 +1492,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                        const size_t id = hash_id(src);
                        int src_backend_id = sched->tensor_backend_id[id];
                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
-                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                            need_new_split = true;
                            break;
                        }
@@ -1470,10 +1509,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                }
                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                split = &sched->splits[i_split];
-                split->backend_id = node_backend_id;
-                split->i_start = i;
-                split->n_inputs = 0;
                cur_backend_id = node_backend_id;
+                init_split(sched, split, cur_backend_id, i);
            }

            // find inputs that are not on the same backend
@@ -1529,6 +1566,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                        int n_inputs = split->n_inputs++;
                        GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                        split->inputs[n_inputs] = src;
+                        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && split->w_copy_stream_id == -1 && GGML_SCHED_MAX_COPY_STREAMS > 0) {
+                            split->w_copy_stream_id = sched->cur_copy_stream[cur_backend_id];
+                            sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size = MAX(
+                                    sched->copy_streams[cur_backend_id][split->w_copy_stream_id].max_size,
+                                    ggml_backend_buft_get_alloc_size(sched->bufts[cur_backend_id], src));
+                            sched->cur_copy_stream[cur_backend_id] = (sched->cur_copy_stream[cur_backend_id] + 1) % GGML_SCHED_MAX_COPY_STREAMS;
+                        }
                    }
                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
                }
@@ -1540,6 +1584,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 #ifdef DEBUG_PASS4
    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif
+    if (getenv("GGML_DEBUG_SCHED")) {
+        fprintf(stderr, "SPLIT GRAPH\n");
+        ggml_backend_sched_print_assignments(sched, graph);
+    }

    // create copies of the graph for each split
    // TODO: avoid this copy
@@ -1613,6 +1661,25 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    // allocate weights in the copy buffers
+    for (int s = 0; s < sched->n_splits; s++) {
+        struct ggml_backend_sched_split * split = &sched->splits[s];
+        if (split->w_copy_stream_id != -1) {
+            struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[split->backend_id][split->w_copy_stream_id];
+            ggml_backend_buffer_t buffer = stream->buffer;
+            if (buffer == NULL) {
+                continue;
+            }
+            for (int j = 0; j < split->n_inputs; j++) {
+                struct ggml_tensor * input = split->inputs[j];
+                if (input->buffer != NULL && input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                    struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+                    ggml_backend_tensor_alloc(buffer, input_cpy, ggml_backend_buffer_get_base(buffer));
+                }
+            }
+        }
+    }
+
    // allocate graph
    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
@@ -1637,6 +1704,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
        struct ggml_backend_sched_split * split = &splits[i];
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];
+        struct ggml_backend_sched_copy_stream * stream = NULL;
+
+        if (split->w_copy_stream_id != -1) {
+            stream = &sched->copy_streams[split_backend_id][split->w_copy_stream_id];
+        }

        // copy the input tensors to the split backend
        for (int j = 0; j < split->n_inputs; j++) {
@@ -1644,7 +1716,9 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
            struct ggml_tensor * input = split->inputs[j];
            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];

-            if (input->flags & GGML_TENSOR_FLAG_INPUT) {
+            if (input->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS && stream && stream->stream) {
+                ggml_backend_tensor_copy_async(input_backend, stream->stream, input, input_cpy);
+            } else if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
                if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                    ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
@@ -1663,6 +1737,11 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
            }
        }

+        if (stream && stream->stream) {
+            ggml_backend_event_record(stream->event_copy);
+            ggml_backend_event_wait(split_backend, stream->event_copy);
+        }
+
        if (!sched->callback_eval) {
            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
            if (ec != GGML_STATUS_SUCCESS) {
@@ -1702,6 +1781,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
            }
        }

+        // record event of this copy stream
+        if (stream && stream->stream) {
+            ggml_backend_event_record(stream->event_use);
+            ggml_backend_event_wait(stream->stream, stream->event_use);
+        }
+
        // record the event of this copy
        if (split->n_inputs > 0) {
            if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
@@ -1766,11 +1851,19 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    if (sched == NULL) {
        return;
    }
+
    for (int b = 0; b < sched->n_backends; b++) {
        for (int c = 0; c < sched->n_copies; c++) {
            ggml_backend_event_free(sched->events[b][c]);
        }
+        for (int s = 0; s < GGML_SCHED_MAX_COPY_STREAMS; s++) {
+            ggml_backend_buffer_free(sched->copy_streams[b][s].buffer);
+            ggml_backend_event_free(sched->copy_streams[b][s].event_copy);
+            ggml_backend_event_free(sched->copy_streams[b][s].event_use);
+            ggml_backend_free(sched->copy_streams[b][s].stream);
+        }
    }
+
    ggml_gallocr_free(sched->galloc);
    ggml_free(sched->ctx);
    free(sched->splits);
@@ -1789,6 +1882,7 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
        memset(sched->hash_set.keys,      0, sizeof(sched->hash_set.keys[0])     * hash_size); // NOLINT
        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
        memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);
+        memset(sched->cur_copy_stream,    0, sizeof(sched->cur_copy_stream[0])   * sched->n_backends);

        sched->is_reset = true;
    }
@@ -1800,7 +1894,46 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *

    ggml_backend_sched_split_graph(sched, measure_graph);

-    // TODO: extract this to a separate function
+    // allocate tensor copy streams
+    for (int b = 0; b < sched->n_backends; b++) {
+        for (int j = 0; j < GGML_SCHED_MAX_COPY_STREAMS; j++) {
+            struct ggml_backend_sched_copy_stream * stream = &sched->copy_streams[b][j];
+            if (stream->max_size > 0) {
+                // backend
+                if (!stream->stream) {
+                    stream->stream = ggml_backend_dup(sched->backends[b]);
+                }
+
+                if (!stream->stream) {
+                    continue;
+                }
+
+                // events
+                if (!stream->event_copy) {
+                    stream->event_copy = ggml_backend_event_new(stream->stream);
+                }
+
+                if (!stream->event_use) {
+                    stream->event_use = ggml_backend_event_new(sched->backends[b]);
+                }
+
+                if (!stream->event_copy || !stream->event_use) {
+                    continue;
+                }
+
+                // buffer
+                if (!stream->buffer || ggml_backend_buffer_get_size(stream->buffer) < stream->max_size) {
+                    ggml_backend_buffer_free(stream->buffer);
+                    stream->buffer = ggml_backend_buft_alloc_buffer(sched->bufts[b], stream->max_size);
+                    if (stream->buffer == NULL) {
+                        fprintf(stderr, "%s: failed to allocate buffer for copy stream\n", __func__);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
        return false;
    }
@@ -1868,7 +2001,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

-    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+    size_t size = ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+
+    for (int i = 0; i < GGML_SCHED_MAX_COPY_STREAMS; i++) {
+        if (sched->copy_streams[backend_index][i].buffer == NULL) {
+            continue;
+        }
+        size += ggml_backend_buffer_get_size(sched->copy_streams[backend_index][i].buffer);
+    }
+
+    return size;
 }

 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
@@ -1895,7 +2037,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t

    tensor->buffer = buffer;
    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
    ggml_backend_buffer_init_tensor(buffer, tensor);
 }

--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -50,9 +50,10 @@ extern "C" {
    // Backend
    //

-    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+    GGML_API ggml_guid_t    ggml_backend_guid(ggml_backend_t backend);
+    GGML_API const char *   ggml_backend_name(ggml_backend_t backend);
+    GGML_API void           ggml_backend_free(ggml_backend_t backend);
+    GGML_API ggml_backend_t ggml_backend_dup(ggml_backend_t backend);

    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
        }

        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
            cuda_ctx->cuda_graph->number_consecutive_updates++;
        } else {
            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2920,6 +2920,12 @@ static void ggml_backend_cuda_event_synchronize(ggml_backend_event_t event) {
    CUDA_CHECK(cudaEventSynchronize((cudaEvent_t)event->context));
 }

+// Creates another CUDA backend instance for the device used by `backend`.
+// The return value is whatever ggml_backend_cuda_init() yields for that device.
+static ggml_backend_t ggml_backend_cuda_dup(ggml_backend_t backend) {
+    const ggml_backend_cuda_context * src_ctx = (const ggml_backend_cuda_context *) backend->context;
+    return ggml_backend_cuda_init(src_ctx->device);
+}
+
 static ggml_backend_i ggml_backend_cuda_interface = {
    /* .get_name                = */ ggml_backend_cuda_name,
    /* .free                    = */ ggml_backend_cuda_free,
@@ -2939,6 +2945,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
    /* .event_record            = */ ggml_backend_cuda_event_record,
    /* .event_wait              = */ ggml_backend_cuda_event_wait,
    /* .event_synchronize       = */ ggml_backend_cuda_event_synchronize,
+    /* .backend_dup             = */ ggml_backend_cuda_dup,
 };

 static ggml_guid_t ggml_backend_cuda_guid() {
--- a/ggml-cuda/upscale.cu
+++ b/ggml-cuda/upscale.cu
@@ -1,35 +1,36 @@
 #include "upscale.cuh"

-static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
-    // blockIdx.z: idx of ne02*ne03
-    // blockIdx.y: idx of ne01*scale_factor， aka ne1
-    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
-    // ne00xne01: ne00 * ne01
-    int ne0 = ne00 * scale_factor;
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
+// Upscale kernel: every in-range thread writes exactly one element of dst.
+//   x          source data, addressed through the byte strides nb00..nb03
+//   dst        destination, written at the flat element index (dst[index])
+//   ne10..ne13 destination extents, used to split the flat index into i10..i13
+//   sf0..sf3   per-dimension scale factors; a source coordinate is the destination
+//              coordinate divided by the factor, truncated when stored into an int
+// NOTE(review): the element count and the flat index are computed in int - confirm
+// that no caller passes a destination with more than INT_MAX elements
+static __global__ void upscale_f32(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne10, const int ne11, const int ne12, const int ne13,
+        const float sf0, const float sf1, const float sf2, const float sf3) {
+    int index = threadIdx.x + blockIdx.x * blockDim.x;
+    // threads past the end of the destination have nothing to do
+    if (index >= ne10 * ne11 * ne12 * ne13) {
        return;
    }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = blockIdx.y / scale_factor;
-    int offset_src =
-        i00 +
-        i01 * ne00 +
-        blockIdx.z * ne00xne01;
-    int offset_dst =
-        nidx +
-        blockIdx.y * ne0 +
-        blockIdx.z * ne0 * gridDim.y;
-    dst[offset_dst] = x[offset_src];
+
+    // split the flat destination index into 4 coordinates, dimension 0 varying fastest
+    int i10 = index % ne10;
+    int i11 = (index / ne10) % ne11;
+    int i12 = (index / (ne10 * ne11)) % ne12;
+    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+    // map each destination coordinate back to the source coordinate it is copied from
+    int i00 = i10 / sf0;
+    int i01 = i11 / sf1;
+    int i02 = i12 / sf2;
+    int i03 = i13 / sf3;
+
+    // the source offset is accumulated in bytes, hence the char * arithmetic
+    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
 }

-static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
-                             const int scale_factor, cudaStream_t stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
-    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
+// Launches upscale_f32 on `stream` with a 1D grid of CUDA_UPSCALE_BLOCK_SIZE-thread blocks,
+// sized so that there is at least one thread per destination element.
+// All arguments are forwarded to the kernel unchanged.
+static void upscale_f32_cuda(const float * x, float * dst,
+        const int nb00, const int nb01, const int nb02, const int nb03,
+        const int ne10, const int ne11, const int ne12, const int ne13,
+        const float sf0, const float sf1, const float sf2, const float sf3,
+        cudaStream_t stream) {
+    int dst_size = ne10 * ne11 * ne12 * ne13;
+    // round up so that a partially filled last block still covers the tail elements
+    int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+
+    upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
 }

 void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)dst->ne[0]/src0->ne[0];
+    const float sf1 = (float)dst->ne[1]/src0->ne[1];
+    const float sf2 = (float)dst->ne[2]/src0->ne[2];
+    const float sf3 = (float)dst->ne[3]/src0->ne[3];

-    upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
+    upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
 }
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -120,9 +120,16 @@ extern "C" {
 #ifndef __F16C__
 #define __F16C__
 #endif
+#endif
+
+// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
+#ifndef __SSSE3__
+#define __SSSE3__
+#endif
 #endif

 // 16-bit float
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1378,7 +1378,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

                        if (ne00%4 == 0) {
-                            while (nth < ne00/4 && nth < 256) {
+                            while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
                                nth *= 2;
                            }
                            if (use_f16) {
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
                            }
                        } else {
-                            while (nth < ne00 && nth < 1024) {
+                            while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
                                nth *= 2;
                            }
                            if (use_f16) {
@@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
                    {
                        GGML_ASSERT(src0->type == GGML_TYPE_F32);

-                        const int sf = dst->op_params[0];
+                        const float sf0 = (float)ne0/src0->ne[0];
+                        const float sf1 = (float)ne1/src0->ne[1];
+                        const float sf2 = (float)ne2/src0->ne[2];
+                        const float sf3 = (float)ne3/src0->ne[3];

                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;

@@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-                        [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
+                        [encoder setBytes:&sf0  length:sizeof(sf0)  atIndex:18];
+                        [encoder setBytes:&sf1  length:sizeof(sf1)  atIndex:19];
+                        [encoder setBytes:&sf2  length:sizeof(sf2)  atIndex:20];
+                        [encoder setBytes:&sf3  length:sizeof(sf3)  atIndex:21];

                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);

@@ -2512,13 +2518,14 @@ static enum ggml_status ggml_metal_graph_compute(
                    } break;
                case GGML_OP_FLASH_ATTN_EXT:
                    {
-                        GGML_ASSERT(ne00 % 4 == 0);
+                        GGML_ASSERT(ne00 % 4  == 0);
+                        GGML_ASSERT(ne11 % 32 == 0);
+
                        GGML_ASSERT(src0->type == GGML_TYPE_F32);

-                        struct ggml_tensor * src3 = gf->nodes[i]->src[3];
+                        GGML_ASSERT(ggml_are_same_shape (src1, src2));

-                        GGML_ASSERT(ggml_are_same_shape(src1, src2));
-                        GGML_ASSERT(src3);
+                        struct ggml_tensor * src3 = gf->nodes[i]->src[3];

                        size_t offs_src3 = 0;

@@ -2528,6 +2535,11 @@ static enum ggml_status ggml_metal_graph_compute(
                        GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
                                "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");

+                        const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+                        const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+                        const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+                        const uint64_t nb23 = src2 ? src2->nb[3] : 0;
+
                        const int64_t  ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
                      //const int64_t  ne31 = src3 ? src3->ne[1] : 0;
                        const int64_t  ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
@@ -2590,34 +2602,35 @@ static enum ggml_status ggml_metal_graph_compute(
                        [encoder setBuffer:id_src0     offset:offs_src0           atIndex:0];
                        [encoder setBuffer:id_src1     offset:offs_src1           atIndex:1];
                        [encoder setBuffer:id_src2     offset:offs_src2           atIndex:2];
-                        [encoder setBuffer:id_src3     offset:offs_src3           atIndex:3];
+                        if (id_src3) {
+                            [encoder setBuffer:id_src3     offset:offs_src3           atIndex:3];
+                        } else {
+                            [encoder setBuffer:id_src0     offset:offs_src0           atIndex:3];
+                        }
                        [encoder setBuffer:id_dst      offset:offs_dst            atIndex:4];
-                        [encoder setBytes:&ne00        length:sizeof( int64_t)    atIndex:5];
-                        [encoder setBytes:&ne01        length:sizeof( int64_t)    atIndex:6];
-                        [encoder setBytes:&ne02        length:sizeof( int64_t)    atIndex:7];
-                        [encoder setBytes:&ne03        length:sizeof( int64_t)    atIndex:8];
-                        [encoder setBytes:&nb00        length:sizeof(uint64_t)    atIndex:9];
-                        [encoder setBytes:&nb01        length:sizeof(uint64_t)    atIndex:10];
-                        [encoder setBytes:&nb02        length:sizeof(uint64_t)    atIndex:11];
-                        [encoder setBytes:&nb03        length:sizeof(uint64_t)    atIndex:12];
-                        [encoder setBytes:&ne10        length:sizeof( int64_t)    atIndex:13];
-                        [encoder setBytes:&ne11        length:sizeof( int64_t)    atIndex:14];
-                        [encoder setBytes:&ne12        length:sizeof( int64_t)    atIndex:15];
-                        [encoder setBytes:&ne13        length:sizeof( int64_t)    atIndex:16];
-                        [encoder setBytes:&nb10        length:sizeof(uint64_t)    atIndex:17];
-                        [encoder setBytes:&nb11        length:sizeof(uint64_t)    atIndex:18];
-                        [encoder setBytes:&nb12        length:sizeof(uint64_t)    atIndex:19];
-                        [encoder setBytes:&nb13        length:sizeof(uint64_t)    atIndex:20];
-                        [encoder setBytes:&nb31        length:sizeof(uint64_t)    atIndex:21];
-                        [encoder setBytes:&ne0         length:sizeof( int64_t)    atIndex:22];
-                        [encoder setBytes:&ne1         length:sizeof( int64_t)    atIndex:23];
-                        [encoder setBytes:&ne2         length:sizeof( int64_t)    atIndex:24];
-                        [encoder setBytes:&ne3         length:sizeof( int64_t)    atIndex:25];
-                        [encoder setBytes:&scale       length:sizeof(   float)    atIndex:26];
-                        [encoder setBytes:&max_bias    length:sizeof(   float)    atIndex:27];
-                        [encoder setBytes:&m0          length:sizeof(m0)          atIndex:28];
-                        [encoder setBytes:&m1          length:sizeof(m1)          atIndex:29];
-                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
+                        [encoder setBytes:&ne01        length:sizeof( int64_t)    atIndex:5];
+                        [encoder setBytes:&ne02        length:sizeof( int64_t)    atIndex:6];
+                        [encoder setBytes:&ne03        length:sizeof( int64_t)    atIndex:7];
+                        [encoder setBytes:&nb01        length:sizeof(uint64_t)    atIndex:8];
+                        [encoder setBytes:&nb02        length:sizeof(uint64_t)    atIndex:9];
+                        [encoder setBytes:&nb03        length:sizeof(uint64_t)    atIndex:10];
+                        [encoder setBytes:&ne11        length:sizeof( int64_t)    atIndex:11];
+                        [encoder setBytes:&ne12        length:sizeof( int64_t)    atIndex:12];
+                        [encoder setBytes:&ne13        length:sizeof( int64_t)    atIndex:13];
+                        [encoder setBytes:&nb11        length:sizeof(uint64_t)    atIndex:14];
+                        [encoder setBytes:&nb12        length:sizeof(uint64_t)    atIndex:15];
+                        [encoder setBytes:&nb13        length:sizeof(uint64_t)    atIndex:16];
+                        [encoder setBytes:&nb21        length:sizeof(uint64_t)    atIndex:17];
+                        [encoder setBytes:&nb22        length:sizeof(uint64_t)    atIndex:18];
+                        [encoder setBytes:&nb23        length:sizeof(uint64_t)    atIndex:19];
+                        [encoder setBytes:&nb31        length:sizeof(uint64_t)    atIndex:20];
+                        [encoder setBytes:&ne1         length:sizeof( int64_t)    atIndex:21];
+                        [encoder setBytes:&ne2         length:sizeof( int64_t)    atIndex:22];
+                        [encoder setBytes:&scale       length:sizeof(   float)    atIndex:23];
+                        [encoder setBytes:&max_bias    length:sizeof(   float)    atIndex:24];
+                        [encoder setBytes:&m0          length:sizeof(m0)          atIndex:25];
+                        [encoder setBytes:&m1          length:sizeof(m1)          atIndex:26];
+                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];

                        if (!use_vec_kernel) {
                            // half8x8 kernel
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
    constant  uint64_t & nb1,
    constant  uint64_t & nb2,
    constant  uint64_t & nb3,
-    constant   int32_t & sf,
+    constant     float & sf0,
+    constant     float & sf1,
+    constant     float & sf2,
+    constant     float & sf3,
    uint3 tgpig[[threadgroup_position_in_grid]],
    uint3 tpitg[[thread_position_in_threadgroup]],
    uint3   ntg[[threads_per_threadgroup]]) {
@@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
    const int64_t i2 = tgpig.y;
    const int64_t i1 = tgpig.x;

-    const int64_t i03 = i3;
-    const int64_t i02 = i2;
-    const int64_t i01 = i1/sf;
-
-    device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1);
+    const int64_t i03 = i3/sf3;
+    const int64_t i02 = i2/sf2;
+    const int64_t i01 = i1/sf1;

    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-        dst_ptr[i0] = src0_ptr[i0/sf];
+        const int64_t i00 = i0/sf0;
+
+        device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        device       float * dst_ptr  = (device       float *) (dst  +  i3*nb3  +  i2*nb2  +  i1*nb1  +  i0*nb0);
+
+        dst_ptr[0] = src0_ptr[0];
    }
 }

@@ -2049,27 +2054,24 @@ typedef void (flash_attn_ext_f16_t)(
        device const  char * v,
        device const  char * mask,
        device       float * dst,
-        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant   int64_t & ne13,
-        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant  uint64_t & nb13,
+        constant  uint64_t & nb21,
+        constant  uint64_t & nb22,
+        constant  uint64_t & nb23,
        constant  uint64_t & nb31,
-        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
-        constant   int64_t & ne3,
        constant     float & scale,
        constant     float & max_bias,
        constant     float & m0,
@@ -2090,27 +2092,24 @@ kernel void kernel_flash_attn_ext_f16(
        device const  char * v,
        device const  char * mask,
        device       float * dst,
-        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant   int64_t & ne13,
-        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant  uint64_t & nb13,
+        constant  uint64_t & nb21,
+        constant  uint64_t & nb22,
+        constant  uint64_t & nb23,
        constant  uint64_t & nb31,
-        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
-        constant   int64_t & ne3,
        constant     float & scale,
        constant     float & max_bias,
        constant     float & m0,
@@ -2180,10 +2179,6 @@ kernel void kernel_flash_attn_ext_f16(
        const short ne22 = ne12;
        const short ne23 = ne13;

-        const uint nb21 = nb11;
-        const uint nb22 = nb12;
-        const uint nb23 = nb13;
-
        // broadcast
        const short rk2 = ne02/ne12;
        const short rk3 = ne03/ne13;
@@ -2247,11 +2242,16 @@ kernel void kernel_flash_attn_ext_f16(
                        simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
                    }

-                    // mqk = mqk*scale + mask*slope
-                    simdgroup_half8x8 mm;
-                    simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
-                    simdgroup_multiply(mm, mslope, mm);
-                    simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
+                    if (mask != q) {
+                        // mqk = mqk*scale + mask*slope
+                        simdgroup_half8x8 mm;
+                        simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
+                        simdgroup_multiply(mm, mslope, mm);
+                        simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
+                    } else {
+                        // mqk = mqk*scale
+                        simdgroup_multiply(mqk, mscale, mqk);
+                    }

                    simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
                }
@@ -2425,27 +2425,24 @@ kernel void kernel_flash_attn_ext_vec_f16(
        device const  char * v,
        device const  char * mask,
        device       float * dst,
-        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne03,
-        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant  uint64_t & nb03,
-        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant   int64_t & ne13,
-        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant  uint64_t & nb13,
+        constant  uint64_t & nb21,
+        constant  uint64_t & nb22,
+        constant  uint64_t & nb23,
        constant  uint64_t & nb31,
-        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   int64_t & ne2,
-        constant   int64_t & ne3,
        constant     float & scale,
        constant     float & max_bias,
        constant     float & m0,
@@ -2521,10 +2518,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
        const short ne22 = ne12;
        const short ne23 = ne13;

-        const uint nb21 = nb11;
-        const uint nb22 = nb12;
-        const uint nb23 = nb13;
-
        // broadcast
        const short rk2 = ne02/ne12;
        const short rk3 = ne03/ne13;
@@ -2589,8 +2582,7 @@ kernel void kernel_flash_attn_ext_vec_f16(

                    // mqk = mqk*scale + mask*slope
                    if (tiisg == 0) {
-                        float4 mm = (float4) mp4[ic/4 + cc];
-                        mqk = mqk*scale + mm*slope;
+                        mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);

                        ss4[cc] = mqk;
                    }
--- a/ggml-quants.c
+++ b/ggml-quants.c
--- a/ggml-rpc.cpp
+++ b/ggml-rpc.cpp
--- a/ggml-rpc.h
+++ b/ggml-rpc.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// NOTE(review): presumably the maximum number of RPC servers usable at once - the
+// code that consumes this limit is not visible here, confirm in ggml-rpc.cpp
+#define GGML_RPC_MAX_SERVERS       16
+
+// backend API
+// NOTE(review): the format expected for `endpoint` is not shown in this header
+// (presumably "host:port") - confirm against the implementation
+GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+
+// returns nothing; `free` and `total` are presumably out-parameters - TODO confirm
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+
+// NOTE(review): unlike the declarations above, this symbol carries no ggml_ prefix
+GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

+#pragma message("TODO: generalize upscale operator")
+#pragma message("      https://github.com/ggerganov/ggml/pull/814")
+    GGML_ASSERT(false && "TODO: generalize upscale operator");
+
    const int scale_factor = dst->op_params[0];

    upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
@@ -15564,26 +15568,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
    const int64_t r2 = ne12/ne02;
    const int64_t r3 = ne13/ne03;

-#if 0
-    // use syclGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                SYCL_CHECK(
-                        syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , SYCL_R_16F,   nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F,   nb11/sizeof(float),
-                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15595,7 +15579,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
            nb11 / nb10, nb12 / nb10, beta,
            (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
            ne12 * ne13, cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
    } else {
        const int ne23 = ne12*ne13;

@@ -15626,7 +15609,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                                         nb02, nb03, nb12_scaled, nb13_scaled,
                                         nbd2, nbd3, r2, r3, item_ct1);
                                 });
-            }).wait();
+            });
        }
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
            *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15637,9 +15620,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
            dpct::library_data_t::real_half, nb11 / nb10, beta,
            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
            cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
    }
-#endif

    if (no_mixed_dtypes) {
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
--- a/ggml.c
+++ b/ggml.c
@@ -112,6 +112,8 @@ typedef void * thread_ret_t;

 #endif

+typedef pthread_t ggml_thread_t;
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
@@ -1306,6 +1308,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
 #define GGML_F16_VEC_SET1   GGML_F32x4_SET1
 #define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
 // Use vec_xl, not vec_ld, in case the load address is not aligned.
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
@@ -1537,6 +1541,59 @@ static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
 #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
 #endif

+//
+// ggml context
+//
+
+// Bookkeeping for one ggml context. Objects created in the context are addressed
+// relative to mem_buffer (mem_buffer + object offset).
+struct ggml_context {
+    size_t mem_size;          // presumably the size of mem_buffer in bytes - confirm at the init site
+    void * mem_buffer;
+    bool   mem_buffer_owned;
+    bool   no_alloc;
+    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
+
+    int    n_objects;
+
+    // first and last entries of the context's object list
+    struct ggml_object * objects_begin;
+    struct ggml_object * objects_end;
+
+    struct ggml_scratch scratch;
+    struct ggml_scratch scratch_save;
+};
+
+// A ggml_context stored by value next to a `used` flag
+// (presumably marks whether this slot is currently taken - confirm at the allocation site).
+struct ggml_context_container {
+    bool used;
+
+    struct ggml_context context;
+};
+
+// State referenced by every ggml_compute_state through its `shared` pointer.
+struct ggml_compute_state_shared {
+    const struct ggml_cgraph * cgraph;
+    const struct ggml_cplan  * cplan;
+
+    int64_t perf_node_start_cycles;
+    int64_t perf_node_start_time_us;
+
+    const int n_threads;
+
+    // synchronization primitives
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
+
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
+    void * abort_callback_data;
+
+    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+};
+
+// Per-thread compute state: a thread handle, the state shared with the other
+// threads, and a status code.
+struct ggml_compute_state {
+    ggml_thread_t thrd;
+    int ith; // presumably the index of this thread among the workers - confirm at the spawn site
+    struct ggml_compute_state_shared * shared;
+    enum ggml_status ec;
+};
+
 //
 // fundamental operations
 //
@@ -2383,32 +2440,6 @@ static void ggml_setup_op_has_task_pass(void) {
    }
 }

-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool   mem_buffer_owned;
-    bool   no_alloc;
-    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
-
-    int    n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
 //
 // NUMA support
 //
@@ -2822,6 +2853,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
        (t0->ne[3] == t1->ne[3] );
 }

+bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->nb[0] == t1->nb[0] ) &&
+        (t0->nb[1] == t1->nb[1] ) &&
+        (t0->nb[2] == t1->nb[2] ) &&
+        (t0->nb[3] == t1->nb[3] );
+}
+
 // check if t1 can be represented as a repeatition of t0
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -3166,6 +3207,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(

    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);

+#ifdef __clang__
+    // temporary until ggml_tensor::backend is removed
+    #pragma clang diagnostic push
+    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
    *result = (struct ggml_tensor) {
        /*.type         =*/ type,
        /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
@@ -3188,6 +3235,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        /*.padding      =*/ { 0 },
    };

+#ifdef __clang__
+    #pragma clang diagnostic pop
+#endif
+
    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
    //ggml_assert_aligned(result->data);

@@ -6281,7 +6332,10 @@ struct ggml_tensor * ggml_pool_2d(
 static struct ggml_tensor * ggml_upscale_impl(
    struct ggml_context * ctx,
    struct ggml_tensor * a,
-    int scale_factor) {
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
    bool is_node = false;

    if (a->grad) {
@@ -6289,19 +6343,45 @@ static struct ggml_tensor * ggml_upscale_impl(
        is_node = true;
    }

+    GGML_ASSERT(a->ne[0] <= ne0);
+    GGML_ASSERT(a->ne[1] <= ne1);
+    GGML_ASSERT(a->ne[2] <= ne2);
+    GGML_ASSERT(a->ne[3] <= ne3);
+
    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-            a->ne[0] * scale_factor,
-            a->ne[1] * scale_factor,
-            a->ne[2], a->ne[3]);
+            ne0,
+            ne1,
+            ne2,
+            ne3
+            );

    result->op = GGML_OP_UPSCALE;
-    result->op_params[0] = scale_factor;
+
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;

    return result;
 }

+struct ggml_tensor * ggml_upscale(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+}
+
+struct ggml_tensor * ggml_upscale_ext(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+}
+
+// ggml_pad
+
 struct ggml_tensor * ggml_pad(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
@@ -6326,12 +6406,7 @@ struct ggml_tensor * ggml_pad(
    return result;
 }

-struct ggml_tensor * ggml_upscale(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
+// ggml_arange

 struct ggml_tensor * ggml_arange(
    struct ggml_context * ctx,
@@ -6353,6 +6428,8 @@ struct ggml_tensor * ggml_arange(
    return result;
 }

+// ggml_timestep_embedding
+
 struct ggml_tensor * ggml_timestep_embedding(
            struct ggml_context * ctx,
            struct ggml_tensor  * timesteps,
@@ -11767,9 +11844,101 @@ static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
 }
 #endif

+static void ggml_compute_forward_mul_mat_one_chunk(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const int64_t num_rows_per_vec_dot,
+    const int64_t ir0_start,
+    const int64_t ir0_end,
+    const int64_t ir1_start,
+    const int64_t ir1_end) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    const bool src1_cont = ggml_is_contiguous(src1);
+
+    ggml_vec_dot_t    const vec_dot = type_traits[type].vec_dot;
+    enum ggml_type    const vec_dot_type = type_traits[type].vec_dot_type;
+
+    // broadcast factors
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
+    //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end);
+
+    // nothing to compute for an empty row range - return immediately
+    if (ir0_start >= ir0_end || ir1_start >= ir1_end) {
+        return;
+    }
+
+    const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    // 16 * 2, accounting for mmla kernels
+    float tmp[32];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) {
+                const int64_t i13 = (ir1 / (ne12 * ne1));
+                const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1;
+                const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13 / r3;
+                const int64_t i02 = i12 / r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char*)wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size
+                        : (i11 * nb11 + i12 * nb12 + i13 * nb13));
+                float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                }
+
+                for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) {
+                    memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float));
+                }
+            }
+        }
+    }
+}
+
 static void ggml_compute_forward_mul_mat(
        const struct ggml_compute_params * params,
-              struct ggml_tensor * dst) {
+              struct ggml_tensor * dst,
+              struct ggml_compute_state * state) {

    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];
@@ -11784,9 +11953,6 @@ static void ggml_compute_forward_mul_mat(

    const enum ggml_type type = src0->type;

-    const bool src1_cont = ggml_is_contiguous(src1);
-
-    ggml_vec_dot_t    const vec_dot               = type_traits[type].vec_dot;
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
    int64_t           const vec_dot_num_rows      = type_traits[type].nrows;
@@ -11807,8 +11973,10 @@ static void ggml_compute_forward_mul_mat(
    GGML_ASSERT(nb2 <= nb3);

    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+    UNUSED(r2);
+    UNUSED(r3);

    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows
@@ -11890,6 +12058,8 @@ static void ggml_compute_forward_mul_mat(
 #endif

 #if GGML_USE_LLAMAFILE
+    const bool src1_cont = ggml_is_contiguous(src1);
+
    if (src1_cont) {
        for (int64_t i13 = 0; i13 < ne13; i13++)
            for (int64_t i12 = 0; i12 < ne12; i12++)
@@ -11915,6 +12085,8 @@ UseGgmlGemm1:;
        if (ith != 0) {
            return;
        }
+        // Every thread starts at ith, so the first unprocessed chunk is nth.  This saves a bit of coordination right at the start.
+        atomic_store(&state->shared->current_chunk, nth);
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -11939,11 +12111,11 @@ UseGgmlGemm1:;
        return;
    }

-    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
-    const size_t row_size = ggml_row_size(vec_dot_type, ne10);
-
 #if GGML_USE_LLAMAFILE
    if (src1->type != vec_dot_type) {
+        const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+
        for (int64_t i13 = 0; i13 < ne13; i13++)
            for (int64_t i12 = 0; i12 < ne12; i12++)
                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -11964,98 +12136,87 @@ UseGgmlGemm1:;
 UseGgmlGemm2:;
 #endif

-    const int64_t nr0 = ne01;          // src0 rows
-    const int64_t nr1 = ne1*ne12*ne13; // src1 rows
+#ifdef GGML_PERF
+    int chunks_executed = 0;
+    UNUSED(chunks_executed);
+#endif

-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+    // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers)
+    const int64_t nr0 = ne0;

-    // distribute the thread work across the inner or outer loop based on which one is larger
-
-    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-    const int64_t ith0 = ith % nth0;
-    const int64_t ith1 = ith / nth0;
-
-    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-    const int64_t ir010 = dr0*ith0;
-    const int64_t ir011 = MIN(ir010 + dr0, nr0);
-
-    const int64_t ir110 = dr1*ith1;
-    const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
-    // threads with no work simply yield (not sure if it helps)
-    if (ir010 >= ir011 || ir110 >= ir111) {
-        sched_yield();
-        return;
-    }
-
-    assert(ne12 % ne02 == 0);
-    assert(ne13 % ne03 == 0);
-
-    // block-tiling attempt
-    const int64_t blck_0 = 16;
-    const int64_t blck_1 = 16;
+    // This is the size of the rest of the dimensions of the result
+    const int64_t nr1 = ne1 * ne2 * ne3;

    // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
-    int64_t nrc = vec_dot_num_rows;
+    int64_t num_rows_per_vec_dot = vec_dot_num_rows;
    // TODO: currently the mmla kernels support only even numbered rows/cols.
    // this check can be removed once they are extended to support odd numbered rows/cols too
    if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
-        nrc = 1;
+        num_rows_per_vec_dot = 1;
    }

-    const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11;
+    // Now select a reasonable chunk size.
+    int chunk_size = 16;

-    // attempt to reduce false-sharing (does not seem to make a difference)
-    // 16 * 2, accounting for mmla kernels
-    float tmp[32];
+    // Use a larger chunk size when either result dimension is 1 (all chunking happens along the other dimension)
+    if (nr0 == 1 || nr1 == 1) {
+        chunk_size = 64;
+    }

-    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ir1 += nrc) {
-                const int64_t i13 = (ir1/(ne12*ne1));
-                const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1;
-                const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1);
+    // split the result into chunk_size-sized chunks along both dimensions
+    // The number of chunks in the 0/1 dim.
+    // CEIL(nr0/chunk_size)
+    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

-                // broadcast src0 into src1
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
+    // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
+    //   Also, chunking by thread was measured to perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
+    if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
+        // distribute the thread work across the inner or outer loop based on which one is larger
+        nchunk0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+        nchunk1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+    }

-                const int64_t i1 = i11;
-                const int64_t i2 = i12;
-                const int64_t i3 = i13;
+    // The number of elements in each chunk
+    const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+    const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

-                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+    //if (ith == 0)
+    //    printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d.  Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1);

-                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                //       the original src1 data pointer, so we should index using the indices directly
-                // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                const char * src1_col = (const char *) wdata +
-                    (src1_cont || src1->type != vec_dot_type
-                     ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                     : (i11*nb11 + i12*nb12 + i13*nb13));
-                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+    // The first chunk comes from our thread_id, the rest will get auto-assigned.
+    int current_chunk = ith;

-                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                //}
+    while (current_chunk < nchunk0 * nchunk1) {
+        const int64_t ith0 = current_chunk % nchunk0;
+        const int64_t ith1 = current_chunk / nchunk0;

-                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ir0 += nrc) {
-                    vec_dot(ne00, &tmp[ir0 - iir0], (nrc>1 ? 16 : 0), src0_row + ir0*nb01, (nrc>1 ? nb01 : 0), src1_col, (nrc>1 ? src1_col_stride : 0), nrc);
-                }
+        const int64_t ir0_start = dr0 * ith0;
+        const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

-                for (int cn = 0; cn < nrc; ++cn) {
-                    memcpy(&dst_col[iir0 + cn*nb1/nb0], tmp + (cn*16), (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
-            }
+        const int64_t ir1_start = dr1 * ith1;
+        const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
+
+        ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+
+#ifdef GGML_PERF
+        chunks_executed++;
+#endif
+
+        if (nth >= nchunk0 * nchunk1) {
+            break;
        }
+
+        current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1);
    }
+
+#ifdef GGML_PERF
+    // These numbers are useful when trying to measure how well the thread scheduling works.
+    //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1;
+    //float time = (ggml_perf_time_us() - t0);
+    //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed);
+#endif
 }

 // ggml_compute_forward_mul_mat_id
@@ -14808,25 +14969,28 @@ static void ggml_compute_forward_upscale_f32(
        return;
    }

-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);

    const int ith = params->ith;
    const int nth = params->nth;

    GGML_TENSOR_UNARY_OP_LOCALS

-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];

    // TODO: optimize

    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
+        const int64_t i03 = i3 / sf3;
        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
+            const int64_t i02 = i2 / sf2;
            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
+                const int64_t i01 = i1 / sf1;
                for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
+                    const int64_t i00 = i0 / sf0;

                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
@@ -14856,6 +15020,7 @@ static void ggml_compute_forward_upscale(
    }
 }

+
 // ggml_compute_forward_pad

 static void ggml_compute_forward_pad_f32(
@@ -17306,7 +17471,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(

 /////////////////////////////////

-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) {
    GGML_ASSERT(params);

    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
@@ -17404,7 +17569,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_MUL_MAT:
            {
-                ggml_compute_forward_mul_mat(params, tensor);
+                ggml_compute_forward_mul_mat(params, tensor, state);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
@@ -19020,8 +19185,6 @@ typedef int ggml_lock_t;

 #define GGML_LOCK_INITIALIZER 0

-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join

@@ -19047,8 +19210,6 @@ typedef int ggml_lock_t;

 #define GGML_LOCK_INITIALIZER 0

-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join

@@ -19128,31 +19289,6 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n);  }
 static void clear_numa_thread_affinity(void) {}
 #endif

-struct ggml_compute_state_shared {
-    const struct ggml_cgraph * cgraph;
-    const struct ggml_cplan  * cplan;
-
-    int64_t perf_node_start_cycles;
-    int64_t perf_node_start_time_us;
-
-    const int n_threads;
-
-    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
-
-    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
-    void * abort_callback_data;
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-    int ith;
-    struct ggml_compute_state_shared * shared;
-    enum ggml_status ec;
-};
-
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
    int64_t cycles_cur  = ggml_perf_cycles()  - st->perf_node_start_cycles;
    int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us;
@@ -19425,6 +19561,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput

        * node_n = atomic_load(&state->shared->node_n);
        if (* node_n != last_node_n) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
    }
 }

@@ -19439,6 +19579,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co

        * task_phase = atomic_load(&state->shared->node_task);
        if (* task_phase != last_task_phase) break;
+#if defined(__SSE3__)
+        // Tell the processor we're spinning.  It's a processor hint for spinlocks.
+        _mm_pause();
+#endif
    }
 }

@@ -19478,7 +19622,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                struct ggml_tensor * node = cgraph->nodes[node_n];
                if (GGML_OP_HAS_FINALIZE[node->op]) {
                    params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);
                }
                ggml_graph_compute_perf_stats_node(node, state->shared);
            }
@@ -19498,17 +19642,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                    /* INIT */
                    if (GGML_OP_HAS_INIT[node->op]) {
                        params.type = GGML_TASK_TYPE_INIT;
-                        ggml_compute_forward(&params, node);
+                        ggml_compute_forward(&params, node, state);
                    }

                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                    // they do something more efficient than spinning (?)
                    params.type = GGML_TASK_TYPE_COMPUTE;
-                    ggml_compute_forward(&params, node);
+                    ggml_compute_forward(&params, node, state);

                    if (GGML_OP_HAS_FINALIZE[node->op]) {
                        params.type = GGML_TASK_TYPE_FINALIZE;
-                        ggml_compute_forward(&params, node);
+                        ggml_compute_forward(&params, node, state);
                    }

                    ggml_graph_compute_perf_stats_node(node, state->shared);
@@ -19547,7 +19691,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

        if (state->ith < n_tasks) {
            if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
+                ggml_compute_forward(&params, node, state);
            }
        }

@@ -19568,7 +19712,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

        if (state->ith < n_tasks) {
            params.type = GGML_TASK_TYPE_COMPUTE;
-            ggml_compute_forward(&params, node);
+            ggml_compute_forward(&params, node, state);
        }

        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
@@ -19819,6 +19963,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
        /*.node_task               =*/ GGML_TASK_TYPE_FINALIZE,
        /*.abort_callback          =*/ NULL,
        /*.abort_callback_data     =*/ NULL,
+        /*.current_chunk           =*/ 0,
    };
    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

--- a/ggml.h
+++ b/ggml.h
@@ -565,7 +565,8 @@ extern "C" {
    // n-dimensional tensor
    struct ggml_tensor {
        enum ggml_type         type;
-        enum ggml_backend_type backend;
+
+        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

        struct ggml_backend_buffer * buffer;

@@ -766,7 +767,8 @@ extern "C" {
    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);
@@ -1673,12 +1675,24 @@ extern "C" {
            float                 p1);

    // nearest interpolate
+    // multiplies ne0 and ne1 by scale factor
    // used in stable-diffusion
    GGML_API struct ggml_tensor * ggml_upscale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   scale_factor);

+    // nearest interpolate
+    // upscales a to the specified dimensions (each ne must be >= the corresponding dimension of a)
+    // used in tortoise.cpp
+    GGML_API struct ggml_tensor * ggml_upscale_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   ne0,
+            int                   ne1,
+            int                   ne2,
+            int                   ne3);
+
    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
    GGML_API struct ggml_tensor * ggml_pad(
            struct ggml_context * ctx,
--- a/gguf-py/gguf/init.py
+++ b/gguf-py/gguf/init.py
@@ -2,5 +2,6 @@ from .constants import *
 from .lazy import *
 from .gguf_reader import *
 from .gguf_writer import *
+from .quants import *
 from .tensor_mapping import *
 from .vocab import *
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -13,6 +13,7 @@ from string import ascii_letters, digits
 import numpy as np

 from .constants import (
+    GGML_QUANT_SIZES,
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,
@@ -195,7 +196,7 @@ class GGUFWriter:
        return ((x + n - 1) // n) * n

    def add_tensor_info(
-        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
+        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
        tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.state is not WriterState.EMPTY:
@@ -208,10 +209,6 @@ class GGUFWriter:
        encoded_name = name.encode("utf-8")
        self.ti_data += self._pack("Q", len(encoded_name))
        self.ti_data += encoded_name
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
        if raw_dtype is None:
            if tensor_dtype == np.float16:
                dtype = GGMLQuantizationType.F16
@@ -231,6 +228,15 @@ class GGUFWriter:
                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
        else:
            dtype = raw_dtype
+            if tensor_dtype == np.uint8:
+                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
+                if tensor_shape[-1] % type_size != 0:
+                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
+                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+        n_dims = len(tensor_shape)
+        self.ti_data += self._pack("I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
        self.ti_data += self._pack("I", dtype)
        self.ti_data += self._pack("Q", self.offset_tensor)
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
--- a/gguf-py/gguf/lazy.py
+++ b/gguf-py/gguf/lazy.py
@@ -6,6 +6,7 @@ from typing import Any, Callable
 from collections import deque

 import numpy as np
+from numpy._typing import _Shape
 from numpy.typing import DTypeLike


@@ -110,7 +111,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
            return o

    @classmethod
-    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike = False) -> Callable[[Any], Any]:
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
        def wrapped_fn(*args, **kwargs):
            if kwargs is None:
                kwargs = {}
@@ -130,9 +131,14 @@ class LazyBase(ABC, metaclass=LazyMeta):
                res = args[0]
                assert isinstance(res, cls)
                res = res._meta
-                # allow operations to override the dtype
+                # allow operations to override the dtype and shape
                if meta_noop is not True:
-                    res = cls.meta_with_dtype(res, meta_noop)
+                    if isinstance(meta_noop, tuple):
+                        dtype, shape = meta_noop
+                        assert callable(shape)
+                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
+                    else:
+                        res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)

            if isinstance(res, cls._tensor_type):
                def collect_replace(t: LazyBase):
@@ -168,7 +174,12 @@ class LazyBase(ABC, metaclass=LazyMeta):
            while _t._data is None:
                lt = _t._lazy.popleft()
                if lt._data is not None:
-                    raise ValueError(f"{lt} did not belong in the lazy queue")
+                    # Lazy tensor did not belong in the lazy queue.
+                    # Weirdly only happens with Bloom models...
+                    # likely because tensors aren't unique in the queue.
+                    # The final output is still the same as in eager mode,
+                    # so it's safe to ignore this.
+                    continue
                assert lt._func is not None
                lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
                lt._data = lt._func(lt._args)
@@ -183,12 +194,12 @@ class LazyBase(ABC, metaclass=LazyMeta):

    @classmethod
    def eager_to_meta(cls, t: Any) -> Any:
-        return cls.meta_with_dtype(t, t.dtype)
+        return cls.meta_with_dtype_and_shape(t.dtype, t.shape)

    # must be overridden, meta tensor init is backend-specific
    @classmethod
    @abstractmethod
-    def meta_with_dtype(cls, m: Any, dtype: Any) -> Any: pass
+    def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass

    @classmethod
    def from_eager(cls, t: Any) -> Any:
@@ -205,15 +216,15 @@ class LazyNumpyTensor(LazyBase):
    _tensor_type = np.ndarray

    @classmethod
-    def meta_with_dtype(cls, m: np.ndarray[Any, Any], dtype: DTypeLike) -> np.ndarray[Any, Any]:
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
        # The initial idea was to use np.nan as the fill value,
        # but non-float types like np.int16 can't use that.
        # So zero it is.
        cheat = np.zeros(1, dtype)
-        return np.lib.stride_tricks.as_strided(cheat, m.shape, (0 for _ in m.shape))
+        return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))

    def astype(self, dtype, *args, **kwargs):
-        meta = type(self).meta_with_dtype(self._meta, dtype)
+        meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
        full_args = (self, dtype,) + args
        # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
        return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+from typing import Callable
+
+from numpy.typing import DTypeLike
+
+from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
+from .lazy import LazyNumpyTensor
+
+import numpy as np
+
+
+# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
+def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
+    n = n.astype(np.float32, copy=False).view(np.int32)
+    # force nan to quiet
+    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
+    # flush subnormals to zero
+    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
+    # round to nearest even
+    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
+    return n.astype(np.int16)
+
+
+# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
+def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
+    rows = arr.reshape((-1, arr.shape[-1]))
+    osize = 1
+    for dim in oshape:
+        osize *= dim
+    out = np.empty(shape=osize, dtype=otype)
+    # compute over groups of 16 rows (arbitrary, but seems good for performance)
+    n_groups = rows.shape[0] // 16
+    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
+    return out.reshape(oshape)
+
+
+def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
+    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)
+
+
+__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)
+
+
+def quantize_bf16(n: np.ndarray):
+    if type(n) is LazyNumpyTensor:
+        return __quantize_bf16_lazy(n)
+    else:
+        return __quantize_bf16_array(n)
+
+
+__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
+
+
+def can_quantize_to_q8_0(n: np.ndarray) -> bool:
+    return n.shape[-1] % __q8_block_size == 0
+
+
+# round away from zero
+# ref: https://stackoverflow.com/a/59143326/22827863
+def np_roundf(n: np.ndarray) -> np.ndarray:
+    a = abs(n)
+    floored = np.floor(a)
+    b = floored + np.floor(2 * (a - floored))
+    return np.sign(n) * b
+
+
+def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
+    return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
+
+
+# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
+def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
+    shape = n.shape
+    assert shape[-1] % __q8_block_size == 0
+
+    n_blocks = n.size // __q8_block_size
+
+    blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
+
+    d = abs(blocks).max(axis=1, keepdims=True) / 127
+    with np.errstate(divide="ignore"):
+        id = np.where(d == 0, 0, 1 / d)
+    qs = np_roundf(blocks * id)
+
+    # (n_blocks, 2)
+    d = d.astype(np.float16).view(np.uint8)
+    # (n_blocks, block_size)
+    qs = qs.astype(np.int8).view(np.uint8)
+
+    assert d.shape[1] + qs.shape[1] == __q8_type_size
+
+    return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
+
+
+def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
+    return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
+
+
+__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
+    __quantize_q8_0_array,
+    meta_noop=(np.uint8, __quantize_q8_0_shape_change),
+)
+
+
+def quantize_q8_0(data: np.ndarray):
+    if type(data) is LazyNumpyTensor:
+        return __quantize_q8_0_lazy(data)
+    else:
+        return __quantize_q8_0_array(data)
--- a/llama.cpp
+++ b/llama.cpp
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -1685,91 +1689,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
    GGML_UNUSED(host_buffer);
 }

-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -2210,6 +2129,8 @@ struct llama_model {
    int main_gpu;
    int n_gpu_layers;

+    std::vector<std::string> rpc_servers;
+
    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

@@ -2353,6 +2274,104 @@ struct llama_context {
 #endif
 };

+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    const bool have_endpoint = gpu >= 0 && (size_t) gpu < model.rpc_servers.size(); // no such server -> CPU fallback below
+    buft = have_endpoint ? ggml_backend_rpc_buffer_type(model.rpc_servers[gpu].c_str()) : nullptr;
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
+#endif
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
+
 //
 // kv cache helpers
 //
@@ -2805,6 +2824,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
    cache.do_defrag = true;
 }

+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
@@ -4424,7 +4448,9 @@ static void llm_load_vocab(
            } else if (
                    tokenizer_pre == "gpt-2"   ||
                    tokenizer_pre == "jina-es" ||
-                    tokenizer_pre == "jina-de") {
+                    tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "jina-v2-es" ||
+                    tokenizer_pre == "jina-v2-de") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "refact") {
@@ -4784,13 +4810,13 @@ static bool llm_load_tensors(

    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
        // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
        std::vector<float> splits(device_count);
        if (all_zero) {
            // default split, by free memory
            for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
            }
        } else {
            std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4810,35 +4836,35 @@ static bool llm_load_tensors(
        int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
        }
        // assign the output layer
        if (n_gpu_layers > n_layer) {
            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
        }
    } else {
        ggml_backend_buffer_type_t split_buft;
        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
        } else {
            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
        }
        // assign the repeating layers
        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
            model.buft_layer[i] = {
                split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
            };
        }
        // assign the output layer
        if (n_gpu_layers > n_layer) {
            model.buft_output = {
                split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
            };
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
@@ -6334,8 +6360,6 @@ static void llm_build_kv_store(
            (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
    cb(k_cache_view, "k_cache_view", il);

-    // note: storing RoPE-ed version of K in the KV cache
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));

    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);

@@ -6354,7 +6378,19 @@ static void llm_build_kv_store(
    }
    cb(v_cache_view, "v_cache_view", il);

-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
+    struct ggml_tensor * k_cur_cast = ggml_cast(ctx, k_cur, k_cache_view->type);
+    struct ggml_tensor * v_cur_cast = ggml_cast(ctx, v_cur, v_cache_view->type);
+
+    ggml_build_forward_expand(graph, k_cur_cast);
+    ggml_build_forward_expand(graph, v_cur_cast);
+
+    // note: storing RoPE-ed version of K in the KV cache
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur_cast, k_cache_view));
+    //ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+
+
+    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_cast, v_cache_view));
+    //ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
 }

 static struct ggml_tensor * llm_build_norm(
@@ -11508,7 +11544,8 @@ static int llama_decode_internal(
                // a heuristic, to avoid attending the full cache if it is not yet utilized
                // after enough generations, the benefit from this heuristic disappears
                // if we start defragmenting the cache, the benefit from this will be more important
-                kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256)));
+                const uint32_t pad = llama_kv_cache_get_padding(cparams);
+                kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
                //kv_self.n = llama_kv_cache_cell_max(kv_self);
            }
        }
@@ -13174,6 +13211,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
    return rejects;
 }

+static bool llama_grammar_detect_left_recursion(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        size_t                                                  rule_index,
+        std::vector<bool>                                     * rules_visited,
+        std::vector<bool>                                     * rules_in_progress,
+        std::vector<bool>                                     * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+    return false;
+}
+
 //
 // grammar - external
 //
@@ -13193,6 +13282,19 @@ struct llama_grammar * llama_grammar_init(
        vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
    }

+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+        }
+    }
+
    // loop over alternates of start rule to build initial stacks
    std::vector<std::vector<const llama_grammar_element *>> stacks;
    pos = vec_rules[start_rule_index].data();
@@ -13215,6 +13317,9 @@ struct llama_grammar * llama_grammar_init(
        }
    } while (true);

+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }

@@ -15314,6 +15419,7 @@ struct llama_model_params llama_model_default_params() {
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
+        /*.rpc_servers                 =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
@@ -15384,7 +15490,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
    return 1;
 #elif defined(GGML_USE_CUDA)
    return GGML_CUDA_MAX_DEVICES;
@@ -15407,7 +15515,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
    return true;
 #else
@@ -15470,7 +15578,17 @@ struct llama_model * llama_load_model_from_file(
            return true;
        };
    }
-
+    if (params.rpc_servers != nullptr) {
+        // split the servers set them into model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
    int status = llama_model_load(path_model, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
@@ -15509,6 +15627,11 @@ struct llama_context * llama_new_context_with_model(
        return nullptr;
    }

+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
    llama_context * ctx = new llama_context(*model);

    const auto & hparams = model->hparams;
@@ -15532,7 +15655,7 @@ struct llama_context * llama_new_context_with_model(
    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

    // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, 256);
+    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

    // with causal attention, the batch size is limited by the context size
    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15577,11 +15700,6 @@ struct llama_context * llama_new_context_with_model(
        }
    }

-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
@@ -15617,7 +15735,17 @@ struct llama_context * llama_new_context_with_model(

    if (!hparams.vocab_only) {
        // initialize backends
-#ifdef GGML_USE_METAL
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
        if (model->n_gpu_layers > 0) {
            ctx->backend_metal = ggml_backend_metal_init();
            if (ctx->backend_metal == nullptr) {
@@ -15773,7 +15901,11 @@ struct llama_context * llama_new_context_with_model(
            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));

            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-            bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
+            bool pipeline_parallel =
+                llama_get_device_count(*model) > 1 &&
+                model->n_gpu_layers > (int)model->hparams.n_layer &&
+                model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+                params.offload_kqv;
 #ifndef GGML_USE_CUDA
            // pipeline parallelism requires support for async compute and events
            // currently this is only implemented in the CUDA backend
--- a/llama.h
+++ b/llama.h
@@ -242,6 +242,9 @@ extern "C" {
        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
        const float * tensor_split;

+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
        // If the provided progress_callback returns true, model loading continues.
        // If it returns false, model loading is immediately aborted.
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -112,6 +112,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # src/ggml-opencl.h           -> ggml-opencl.h
    # src/ggml-quants.c           -> ggml-quants.c
    # src/ggml-quants.h           -> ggml-quants.h
+    # src/ggml-rpc.cpp            -> ggml-rpc.cpp
+    # src/ggml-rpc.h              -> ggml-rpc.h
    # src/ggml-sycl.cpp           -> ggml-sycl.cpp
    # src/ggml-sycl.h             -> ggml-sycl.h
    # src/ggml-vulkan.cpp         -> ggml-vulkan.cpp
@@ -149,6 +151,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
+        -e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11
+126d34985705a5a2222723c145cb4e125ac689f3
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -20,6 +20,8 @@ cp -rpv ../ggml/src/ggml-opencl.cpp         ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h           ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c           ./ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h           ./ggml-quants.h
+cp -rpv ../ggml/src/ggml-rpc.cpp            ./ggml-rpc.cpp
+cp -rpv ../ggml/src/ggml-rpc.h              ./ggml-rpc.h
 cp -rpv ../ggml/src/ggml-sycl.cpp           ./ggml-sycl.cpp
 cp -rpv ../ggml/src/ggml-sycl.h             ./ggml-sycl.h
 cp -rpv ../ggml/src/ggml-vulkan.cpp         ./ggml-vulkan.cpp
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1329,23 +1329,47 @@ struct test_upscale : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int32_t scale_factor;
+    const bool transpose;

    std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale_factor);
+        return VARS_TO_STR4(type, ne, scale_factor, transpose);
    }

    test_upscale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2)
-        : type(type), ne(ne), scale_factor(scale_factor) {}
+            int32_t scale_factor = 2, bool transpose = false)
+        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        if (transpose) a = ggml_transpose(ctx, a);
        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
        return out;
    }
 };

+// GGML_OP_UPSCALE (ext)
+struct test_upscale_ext : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_tgt;
+
+    std::string vars() override {
+        return VARS_TO_STR3(type, ne, ne_tgt);
+    }
+
+    test_upscale_ext(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne     = {2, 5,  7, 11},
+            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
+        : type(type), ne(ne), ne_tgt(ne_tgt) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
+        return out;
+    }
+};
+
 // GGML_OP_GROUP_NORM
 struct test_group_norm : public test_case {
    const ggml_type type;
@@ -1487,25 +1511,27 @@ struct test_flash_attn_ext : public test_case {
    const int64_t kv; // kv size
    const int64_t nb; // batch size

+    const bool mask; // use mask
+
    const float max_bias; // ALiBi

    std::string vars() override {
-        return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
+        return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
    }

    double max_nmse_err() override {
        return 5e-4;
    }

-    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
-        : hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
+        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
        ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
        ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
-        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
-        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
+        ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
        return out;
    }
 };
@@ -2167,6 +2193,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_upscale());
+    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
+    test_cases.emplace_back(new test_upscale_ext());
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
@@ -2175,11 +2203,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_leaky_relu());

    for (int hs : { 64, 80, 128, 256, }) {
-        for (float max_bias : {0.0f, 8.0f}) {
-            for (int nh : { 32, }) {
-                for (int kv : { 512, 1024, }) {
-                    for (int nb : { 1, 2, 4, 8, }) {
-                        test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
+        for (bool mask : { true, false } ) {
+            for (float max_bias : { 0.0f, 8.0f }) {
+                if (!mask && max_bias > 0.0f) continue;
+                for (int nh : { 32, }) {
+                    for (int kv : { 512, 1024, }) {
+                        for (int nb : { 1, 2, 4, 8, }) {
+                            test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
+                        }
                    }
                }
            }
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
@@ -28,6 +28,19 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
    return grammar;
 }

+// Returns true when build_grammar() throws a std::exception for `grammar_str`, false when it succeeds.
+static bool test_build_grammar_fails(const std::string & grammar_str) {
+    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
+    try {
+        build_grammar(grammar_str);
+    } catch (const std::exception &) {
+        fprintf(stdout, "  ✅︎\n");
+        return true;
+    }
+    fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
+    return false;
+}
+
 static bool match_string(const std::string & input, llama_grammar* grammar) {
    auto decoded = decode_utf8(input, {});

@@ -320,6 +333,38 @@ number ::= [0-9]+)""";
    fprintf(stderr, "  ✅︎ Passed\n");
 }

+// Checks that build_grammar() rejects left-recursive grammars, from direct to indirect recursion.
+static void test_failure_left_recursion() {
+    fprintf(stderr, "⚫ Testing left recursion detection:\n");
+
+    // Ordered from direct self-reference to recursion hidden behind a rule that can match nothing
+    const std::string simple_str = R"""(root ::= "a" | root "a")""";
+    const std::string medium_str = R"""(
+root ::= asdf
+asdf ::= "a" | asdf "a"
+)""";
+    const std::string hard_str = R"""(
+root ::= asdf
+asdf ::= "a" | foo "b"
+foo ::= "c" | asdf "d" | "e")""";
+    const std::string hardest_str = R"""(
+root ::= asdf
+asdf ::= "a" | foo "b"
+foo ::= "c" | empty asdf "d" | "e"
+empty ::= "blah" | )""";
+
+    // Call outside assert(): with NDEBUG defined assert() discards its argument, so a check
+    // written inside it would never run. The uncaught throw keeps a failure fatal in that case too.
+    const std::string * const grammars[] = { &simple_str, &medium_str, &hard_str, &hardest_str };
+    for (const std::string * grammar_str : grammars) {
+        const bool rejected = test_build_grammar_fails(*grammar_str);
+        assert(rejected);
+        if (!rejected) { throw std::exception(); }
+    }
+
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
 int main() {
    fprintf(stdout, "Running grammar integration tests...\n");
    test_simple_grammar();
@@ -327,6 +372,7 @@ int main() {
    test_quantifiers();
    test_failure_missing_root();
    test_failure_missing_reference();
+    test_failure_left_recursion();
    fprintf(stdout, "All tests passed.\n");
    return 0;
 }
Author	SHA1	Message	Date
slaren	5de9b743f8	sched : support async weight copy	2024-05-16 00:47:40 +02:00
Daniel Bevenius	8f7080bf48	readme : remove stray double quote (#7310 ) Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-05-15 23:41:03 +02:00
kunnis	e1b40ac3b9	ggml : use dynamic thread scheduling for matrix multiplication (#6915 ) * Just reordering some structs. * Adding in the calls to mm_pause * Passing around the state * Renaming and moving a bunch of variables around. * Extracting the logic to it's own function. * Moving some variable definitions into the chunk function. * Moving some variables around * moving src1_cont inside * Moving row_size * adding the current_chunk * Reorg the code. * Formatting to match the orig patch * starting to setup the chunking variables * Starting the buildup of the loop * The yield shouldn't be necessary. * adding the looping structure based on the chunk configuration. * Add in the re-chunking code. * Making it much more likely to rechunk. * disable resizing if numa is enabled. * Updating comments with what we've learned. * Fix formatting * Couple more formatting fixes. * More style fixes. * Fix Warnings * Going with unused because there's conditional logic that needs it. * Update ggml.c * Update ggml.c ---------	2024-05-15 19:59:12 +02:00
agray3	dc020985b8	Avoid unnecessarily disabling CUDA graphs (#7302 ) As discussed in PR #6766, CUDA graphs were being disabled in the presence of long prompts. This fixes the issue by avoiding the consective update counter from incrementing unnecessarily for tokens in which cuda graphs are disabled due to batch size > 1.	2024-05-15 15:44:49 +02:00
slaren	344f9126cc	ggml : tag ggml_tensor::backend as deprecated (#7290 )	2024-05-15 15:08:48 +02:00
AidanBeltonS	9a17ab914b	Add missing " (#7303 )	2024-05-15 17:56:30 +05:30
dm4	ea3b0590ee	embedding : free the batch after execution (#7297 )	2024-05-15 15:01:12 +03:00
Georgi Gerganov	29499bb593	sync : ggml	2024-05-15 13:23:41 +03:00
John Balis	48aa8fd1f2	ggml : add `ggml_upscale_ext` (ggml/814) * initial commit with CPU implementation of upscale to shape and test, cuda implementation next * experimental commit to see if dst shape is correct * test version * test * removed unnecessary params * refactor * fixed tests * ggml : metal impl + cleanup + sycl dev warnings * patched ggml_upscale cuda op to handle non-contiguous tensors, added test for non-contiguous behavior * metal : fix upsacle op to support nb00 + style --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-05-15 13:23:33 +03:00
Johannes Gäßler	583fd6b000	server bench: fix bench not waiting for model load (#7284 )	2024-05-15 08:44:16 +02:00
Georgi Gerganov	9f773486ab	script : sync ggml-rpc	2024-05-14 19:14:38 +03:00
Georgi Gerganov	e8a7fd4fb0	metal : support FA without mask + add asserts (#7278 ) * ggml : fa without mask + add asserts ggml-ci * metal : support non-contiguous KV ggml-ci	2024-05-14 19:09:30 +03:00
Georgi Gerganov	a5e3fde857	sync : ggml ggml-ci	2024-05-14 19:08:09 +03:00
Georgi Gerganov	f308ea7059	metal : tune soft_max number of threads (whisper/0)	2024-05-14 19:08:09 +03:00
Georgi Gerganov	c3c88f296a	ggml : try fix ppc64 (whisper/0)	2024-05-14 19:08:09 +03:00
Przemysław Pawełczyk	182adefcf3	ggml : expose SSE3 and SSSE3 for MSVC when AVX is available (whisper/2128)	2024-05-14 19:08:09 +03:00
Hong Bo PENG	0d26d8ccd8	ggml : optimize for ppc64le using VSX intrinsics (ggml/784) * optimize for ppc64le using VSX intrinsics * 1. code clean up by removing comments about overflow concern. 2. fix typo in suffix of scaling. * Continue to fix typo in suffix of scaling for QK_K <> 256 --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-05-14 19:08:09 +03:00
Steve Grubb	4f0263633b	server: free sampling contexts on exit (#7264 ) * server: free sampling contexts on exit This cleans up last leak found by the address sanitizer. * fix whitespace * fix whitespace	2024-05-14 16:11:24 +02:00
Brian	1265c670fd	Revert "move ndk code to a new library (#6951 )" (#7282 ) This reverts commit `efc8f767c8`.	2024-05-14 16:10:39 +03:00
Radoslav Gerganov	5e31828d3e	ggml : add RPC backend (#6829 ) * ggml : add RPC backend The RPC backend proxies all operations to a remote server which runs a regular backend (CPU, CUDA, Metal, etc). * set TCP_NODELAY * add CI workflows * Address review comments * fix warning * implement llama_max_devices() for RPC * Address review comments * Address review comments * wrap sockfd into a struct * implement get_alignment and get_max_size * add get_device_memory * fix warning * win32 support * add README * readme : trim trailing whitespace * Address review comments * win32 fix * Address review comments * fix compile warnings on macos	2024-05-14 14:27:19 +03:00
slaren	541600201e	llama : disable pipeline parallelism with nkvo (#7265 )	2024-05-14 17:33:42 +10:00
Elton Kola	efc8f767c8	move ndk code to a new library (#6951 )	2024-05-14 17:30:30 +10:00
Haggai Nuchi	e0f556186b	Add left recursion check: quit early instead of going into an infinite loop (#7083 ) * Add left recursion check: quit early instead of going into an infinite loop * Remove custom enum, rename left recursion check and move to "grammar internal" section, add handling for edge case where a leftmost nonterminal may be empty * Remove unnecessary declaration	2024-05-14 15:25:56 +10:00
Ryuei	27f65d6267	docs: Fix typo and update description for --embeddings flag (#7026 ) - Change '--embedding' to '--embeddings' in the README - Update the description to match the latest --help output - Added a caution about defining physical batch size	2024-05-14 15:20:47 +10:00
compilade	ee52225067	convert-hf : support direct Q8_0 conversion (#7234 ) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass.	2024-05-13 14:10:51 -04:00
Georgi Gerganov	614d3b914e	llama : less KV padding when FA is off (#7257 ) ggml-ci	2024-05-13 17:15:15 +03:00
k.h.lai	30e70334f7	llava-cli: fix base64 prompt (#7248 )	2024-05-14 00:02:36 +10:00
Johannes Gäßler	1c570d8bee	perplexity: add BF16 vs. FP16 results (#7150 )	2024-05-13 13:03:27 +02:00
Neo Zhang	948f4ec7c5	[SYCL] rm wait() (#7233 )	2024-05-13 18:11:26 +08:00
Joan Fontanals	9aa672490c	llama : rename jina tokenizers to v2 (#7249 ) * refactor: rename jina tokenizers to v2 * refactor: keep refactoring non-breaking	2024-05-13 11:35:14 +03:00
Brian	b1f8af1886	convert.py: Outfile default name change and additional metadata support (#4858 ) * convert.py: Outfile default name change and additional metadata support * convert.py: don't stringify Metadata load method output * convert.py: typo fix * convert.py: fix metadata format to sync with LLM_KV_NAMES in llama.cpp	2024-05-13 12:56:47 +10:00
Benjamin Findley	e586ee4259	change default temperature of OAI compat API from 0 to 1 (#7226 ) * change default temperature of OAI compat API from 0 to 1 * make tests explicitly send temperature to OAI API	2024-05-13 12:40:08 +10:00
Neo Zhang	cbf75894d2	[SYCL] Add oneapi runtime dll files to win release package (#7241 ) * add oneapi running time dlls to release package * fix path * fix path * fix path * fix path * fix path --------- Co-authored-by: Zhang <jianyu.zhang@intel.com>	2024-05-13 08:04:29 +08:00
Neo Zhang	0d5cef78ae	[SYCL] update CI with oneapi 2024.1 (#7235 ) Co-authored-by: Zhang <jianyu.zhang@intel.com>	2024-05-13 08:02:55 +08:00