swift : fix build

ggml-ci
batched.swift : fix build
2026-04-23 16:37:33 +03:00 · 2024-02-23 19:02:09 +02:00 · 2024-02-23 16:15:37 +02:00 · 2024-02-23 14:25:05 +02:00 · 2024-02-23 12:34:16 +02:00 · 2024-02-22 17:05:23 -05:00
37 changed files with 1480 additions and 742 deletions
--- a/.devops/nix/docker.nix
+++ b/.devops/nix/docker.nix
@@ -0,0 +1,37 @@
+{
+  lib,
+  dockerTools,
+  buildEnv,
+  llama-cpp,
+  interactive ? true, # when true, coreutils, dockerTools.binSh and dockerTools.caCertificates are added to the image contents
+  coreutils,
+}:
+
+# A tar that can be fed into `docker load`:
+#
+# $ nix build .#llamaPackages.docker
+# $ docker load < result
+
+# For details and variations cf.
+# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
+# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
+# - https://nixery.dev/
+
+# Approximate (compressed) sizes, at the time of writing, are:
+#
+# .#llamaPackages.docker: 125M;
+# .#llamaPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+
+dockerTools.buildLayeredImage {
+  name = llama-cpp.pname;
+  tag = "latest";
+
+  contents =
+    [ llama-cpp ]
+    ++ lib.optionals interactive [
+      coreutils
+      dockerTools.binSh
+      dockerTools.caCertificates
+    ];
+}
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -12,5 +12,8 @@ lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
  }
 )
--- a/.devops/nix/sif.nix
+++ b/.devops/nix/sif.nix
@@ -0,0 +1,27 @@
+{
+  lib,
+  singularity-tools,
+  llama-cpp,
+  bashInteractive,
+  interactive ? false, # when true, bashInteractive is added to the image contents
+}:
+
+let
+    optionalInt = cond: x: if cond then x else 0; # yields x when cond holds, otherwise 0
+in
+singularity-tools.buildImage rec {
+  inherit (llama-cpp) name;
+  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+  # These are excessive (but safe) for most variants. Building singularity
+  # images requires superuser privileges, so we build them inside a VM in a
+  # writable image of pre-determined size.
+  #
+  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+  #
+  # Expected image sizes:
+  # - cpu/blas: 150M,
+  # - cuda, all gencodes: 560M.
+  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; # 4096 base, plus 16384 when llama-cpp.useRocm is set (units are those of singularity-tools.buildImage; presumably MiB, TODO confirm)
+  memSize = diskSize; # the VM memory size is set to the same value as the disk size
+}
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -19,7 +19,6 @@ on:

 jobs:
  nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
@@ -37,8 +36,8 @@ jobs:
        extra-conf: |
          extra-platforms = aarch64-linux
          extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -46,7 +45,7 @@ jobs:
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
+        name: llama-cpp
    - name: Show all output paths
      run: >
          nix run github:nix-community/nix-eval-jobs
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -23,8 +23,8 @@ jobs:
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,7 +37,6 @@ jobs:
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
    strategy:
      fail-fast: false
      matrix:
@@ -51,8 +50,8 @@ jobs:
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -60,7 +59,7 @@ jobs:
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
+        name: llama-cpp
    - name: Build
      run: >
          nix run github:Mic92/nix-fast-build
--- a/README.md
+++ b/README.md
@@ -10,13 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
-  - [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats:
-  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
-  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
+- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
+- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

 ----
@@ -107,6 +103,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
+- [x] [Gemma](https://ai.google.dev/gemma)

 **Multimodal models:**

@@ -145,6 +142,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -218,6 +218,8 @@ class Model:
            return BertModel
        if model_architecture == "NomicBertModel":
            return NomicBertModel
+        if model_architecture == "GemmaForCausalLM":
+            return GemmaModel
        return Model

    def _is_model_safetensors(self) -> bool:
@@ -277,6 +279,8 @@ class Model:
            return gguf.MODEL_ARCH.BERT
        if arch == "NomicBertModel":
            return gguf.MODEL_ARCH.NOMIC_BERT
+        if arch == "GemmaForCausalLM":
+            return gguf.MODEL_ARCH.GEMMA

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -618,11 +622,6 @@ class MPTModel(Model):

            self.gguf_writer.add_tensor(new_name, data)

-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-

 class OrionModel(Model):
    def set_vocab(self):
@@ -655,6 +654,8 @@ class OrionModel(Model):
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
+        # note: config provides rms norm but it is actually layer norm
+        # ref:  https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

    def write_tensors(self):
@@ -1031,7 +1032,6 @@ class PersimmonModel(Model):
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
@@ -1785,6 +1785,62 @@ class NomicBertModel(BertModel):
            yield name, data


+class GemmaModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+            if name.endswith("norm.weight"):
+                data_torch = data_torch + 1
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1015,9 +1015,9 @@ static struct ggml_tensor * forward_lora(
    struct ggml_tensor * kc = kv_self.k;
    struct ggml_tensor * vc = kv_self.v;

-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N);
    {
-        int * data = (int *) KQ_pos->data;
+        float * data = (float *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -79,7 +79,7 @@ batch.n_tokens = Int32(tokens.count)

 for (i, token) in tokens.enumerated() {
    batch.token[i] = token
-    batch.pos[i] = Int32(i)
+    batch.pos[i] = llama_pos(i)
    batch.n_seq_id[i] = 1
    // batch.seq_id[i][0] = 0
    // TODO: is this the proper way to do this?
@@ -98,7 +98,7 @@ if llama_decode(context, batch) != 0 {
 }

 for i in 1 ..< n_parallel {
-    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, llama_pos(batch.n_tokens))
 }

 if n_parallel > 1 {
@@ -125,8 +125,8 @@ while n_cur <= n_len {
            continue
        }

-        var n_vocab = llama_n_vocab(model)
-        var logits = llama_get_logits_ith(context, i_batch[i])
+        let n_vocab = llama_n_vocab(model)
+        let logits = llama_get_logits_ith(context, i_batch[i])

        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))

@@ -173,7 +173,7 @@ while n_cur <= n_len {

        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
-        batch.pos[Int(batch.n_tokens)] = n_cur
+        batch.pos[Int(batch.n_tokens)] = llama_pos(n_cur)
        batch.n_seq_id[Int(batch.n_tokens)] = 1
        if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
            seq_id[0] = Int32(i)
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -554,7 +554,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    };

    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
    ggml_set_input(KQ_pos);

    // rope has so much parameters that we make a custom function for it
@@ -743,7 +743,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

        // set KQ_pos
        {
-            int * data = (int *) KQ_pos->data;
+            float * data = (float *) KQ_pos->data;
            for (int i = 0; i < N; ++i) {
                data[i] = n_past + i;
            }
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -129,7 +129,7 @@ actor LlamaContext {

        for i1 in 0..<tokens_list.count {
            let i = Int(i1)
-            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
+            llama_batch_add(&batch, tokens_list[i], llama_pos(i), [0], false)
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@@ -183,7 +183,7 @@ actor LlamaContext {
        // tokens_list.append(new_token_id)

        llama_batch_clear(&batch)
-        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
+        llama_batch_add(&batch, new_token_id, llama_pos(n_cur), [0], true)

        n_decode += 1
        n_cur    += 1
@@ -210,7 +210,7 @@ actor LlamaContext {
            let n_tokens = pp

            for i in 0..<n_tokens {
-                llama_batch_add(&batch, 0, Int32(i), [0], false)
+                llama_batch_add(&batch, 0, llama_pos(i), [0], false)
            }
            batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@@ -234,7 +234,7 @@ actor LlamaContext {
                llama_batch_clear(&batch)

                for j in 0..<pl {
-                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
+                    llama_batch_add(&batch, 0, llama_pos(i), [Int32(j)], true)
                }

                if llama_decode(context, batch) != 0 {
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -63,13 +63,12 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Backup your pth/safetensor model files as llava-surgery modifies them
-3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -77,18 +76,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```

-5) Create the visual gguf model:
+4) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

-6) Then convert the model to gguf format:
+5) Then convert the model to gguf format:
 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/
+python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

-7) And finally we can run the llava-cli using the 1.6 model version:
+6) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
--- a/examples/llava/llava-surgery-v2.py
+++ b/examples/llava/llava-surgery-v2.py
@@ -65,9 +65,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
        for name in clip_tensors:
            del checkpoint[name]

-        # Save the updated checkpoint
        checkpoint_path = checkpoint_path
-        save_model(checkpoint, checkpoint_path, file_type)
        return True
    return False

@@ -152,16 +150,6 @@ for name in first_mm_tensors:
 if len(projector) > 0:
    save_model(projector, f"{args.model}/llava.projector", 'pytorch')

-for name in mm_tensors:
-    del last_checkpoint[name]
-for name in first_mm_tensors:
-    del first_checkpoint[name]
-
-if len(mm_tensors) > 0:
-    save_model(last_checkpoint, projector_checkpoint_path, file_type)
-if len(first_mm_tensors) > 0:
-    save_model(first_checkpoint, newline_checkpoint_path, file_type)
-
 print("Done!")
 print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -338,7 +338,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, (float) *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -334,6 +334,8 @@ int main(int argc, char ** argv) {
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
        params.n_keep = (int)embd_inp.size();
+    } else {
+        params.n_keep += add_bos; // always keep the BOS token
    }

    // prefix & suffix for instruct mode
@@ -383,8 +385,8 @@ int main(int argc, char ** argv) {
            }
        }

-        if (params.n_keep > 0) {
-        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+        if (params.n_keep > add_bos) {
+            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
@@ -540,14 +542,14 @@ int main(int argc, char ** argv) {
                        break;
                    }

-                    const int n_left    = n_past - params.n_keep - 1;
+                    const int n_left    = n_past - params.n_keep;
                    const int n_discard = n_left/2;

                    LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                            n_past, n_left, n_ctx, params.n_keep, n_discard);

-                    llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+                    llama_kv_cache_seq_rm   (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_cache_seq_shift(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

                    n_past -= n_discard;

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
+- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)

 ## Build

@@ -140,6 +141,8 @@ node index.js
  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.

+  If the query parameter `include_slots` is passed, the `slots` field will contain internal slots data, unless `--slots-endpoint-disable` is set.
+
 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

    *Options:*
@@ -148,7 +151,7 @@ node index.js

    `temperature`: Adjust the randomness of the generated text (default: 0.8).

-    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).

@@ -206,7 +209,7 @@ node index.js

    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

-    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
+    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

@@ -239,7 +242,7 @@ Notice that each `probs` is an array of length `n_probs`.

 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
+- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
 - `model`: The path to the model loaded with `-m`
 - `prompt`: The provided `prompt`
 - `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -400,6 +400,16 @@ struct llama_server_context
        return true;
    }

+    void validate_model_chat_template(server_params & sparams) {
+        llama_chat_message chat[] = {{"user", "test"}}; // single dummy message, used only to probe the template
+        std::vector<char> buf(1);
+        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size()); // nullptr template: presumably selects the template stored in the model - confirm against llama.h
+        if (res < 0) { // a negative result is treated as "template not supported"
+            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
+        }
+    }
+
    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -1271,7 +1281,7 @@ struct llama_server_context
                }

                const int n_embd = llama_n_embd(model);
-                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
+                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, (float) slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
                    LOG_TEE("%s : failed to eval image\n", __func__);
@@ -1394,6 +1404,46 @@ struct llama_server_context
            case TASK_TYPE_NEXT_RESPONSE: {
                // do nothing
            } break;
+            case TASK_TYPE_SLOTS_DATA: {
+                json slots_data        = json::array();
+                int n_idle_slots       = 0;
+                int n_processing_slots = 0;
+
+                for (llama_client_slot &slot: slots) {
+                    if (slot.available()) {
+                        n_idle_slots++;
+                    } else {
+                        n_processing_slots++;
+                    }
+                    json slot_data = get_formated_generation(slot);
+                    slot_data["id"] = slot.id;
+                    slot_data["task_id"] = slot.task_id;
+                    slot_data["state"] = slot.state;
+                    slot_data["prompt"] = slot.prompt;
+                    slot_data["next_token"] = {
+                            {"has_next_token", slot.has_next_token},
+                            {"n_remain", slot.n_remaining},
+                            {"num_tokens_predicted", slot.n_decoded},
+                            {"stopped_eos", slot.stopped_eos},
+                            {"stopped_word", slot.stopped_word},
+                            {"stopped_limit", slot.stopped_limit},
+                            {"stopping_word", slot.stopping_word},
+                    };
+                    slots_data.push_back(slot_data);
+                }
+                LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+                task_result res;
+                res.id = task.id;
+                res.multitask_id = task.multitask_id;
+                res.stop = true;
+                res.error = false;
+                res.result_json = {
+                        { "idle",       n_idle_slots       },
+                        { "processing", n_processing_slots },
+                        { "slots",      slots_data         }
+                };
+                queue_results.send(res);
+            } break;
        }
    }

@@ -1447,14 +1497,15 @@ struct llama_server_context
                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                {
                    // Shift context
-                    const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
+                    const int n_keep    = slot.params.n_keep + add_bos_token;
+                    const int n_left    = system_tokens.size() + slot.n_past - n_keep;
                    const int n_discard = n_left / 2;

-                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
+                    llama_kv_cache_seq_rm   (ctx, slot.id, n_keep            , n_keep + n_discard);
+                    llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);

-                    for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
                    {
                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
                    }
@@ -1467,7 +1518,7 @@ struct llama_server_context

                    LOG_VERBOSE("context shift", {
                        { "n_ctx", n_ctx },
-                        { "n_keep", params.n_keep },
+                        { "n_keep", n_keep },
                        { "n_left", n_left },
                    });
                }
@@ -2557,34 +2608,38 @@ int main(int argc, char **argv)
        server_state current_state = state.load();
        switch(current_state) {
            case SERVER_STATE_READY: {
-                int available_slots  = 0;
-                int processing_slots = 0;
-                for (llama_client_slot &slot: llama.slots) {
-                    if (slot.available()) {
-                        available_slots++;
-                    } else {
-                        processing_slots++;
-                    }
+                // request slots data using task queue
+                task_server task;
+                task.id   = llama.queue_tasks.get_new_id();
+                task.type = TASK_TYPE_SLOTS_DATA;
+                task.target_id = -1;
+
+                llama.queue_results.add_waiting_task_id(task.id);
+                llama.queue_tasks.post(task);
+
+                // get the result
+                task_result result = llama.queue_results.recv(task.id);
+                llama.queue_results.remove_waiting_task_id(task.id);
+
+                int n_idle_slots       = result.result_json["idle"];
+                int n_processing_slots = result.result_json["processing"];
+
+                json health = {
+                        {"status",           "ok"},
+                        {"slots_idle",       n_idle_slots},
+                        {"slots_processing", n_processing_slots}};
+                res.status = 200; // HTTP OK
+                if (sparams.slots_endpoint && req.has_param("include_slots")) {
+                    health["slots"] = result.result_json["slots"];
                }
-                if (available_slots > 0) {
-                    json health = {
-                            {"status",           "ok"},
-                            {"slots_idle",       available_slots},
-                            {"slots_processing", processing_slots}};
-                    res.set_content(health.dump(), "application/json");
-                    res.status = 200; // HTTP OK
-                } else {
-                    json health = {
-                            {"status",           "no slot available"},
-                            {"slots_idle",       available_slots},
-                            {"slots_processing", processing_slots}};
-                    res.set_content(health.dump(), "application/json");
+
+                if (n_idle_slots == 0) {
+                    health["status"] = "no slot available";
                    if (req.has_param("fail_on_no_slot")) {
                        res.status = 503; // HTTP Service Unavailable
-                    } else {
-                        res.status = 200; // HTTP OK
                    }
                }
+                res.set_content(health.dump(), "application/json");
                break;
            }
            case SERVER_STATE_LOADING_MODEL:
@@ -2600,26 +2655,20 @@ int main(int argc, char **argv)

    if (sparams.slots_endpoint) {
        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
-            json slots;
-            for (llama_client_slot & slot : llama.slots) {
-                json slot_data = llama.get_formated_generation(slot);
-                slot_data["id"] = slot.id;
-                slot_data["task_id"] = slot.task_id;
-                slot_data["state"] = slot.state;
-                slot_data["prompt"] = slot.prompt;
-                slot_data["next_token"] = {
-                        {"has_next_token", slot.has_next_token},
-                        {"n_remain", slot.n_remaining},
-                        {"num_tokens_predicted", slot.n_decoded},
-                        {"stopped_eos", slot.stopped_eos},
-                        {"stopped_word", slot.stopped_word},
-                        {"stopped_limit", slot.stopped_limit},
-                        {"stopping_word", slot.stopping_word},
-                };
+            // request slots data using task queue
+            task_server task;
+            task.id = llama.queue_tasks.get_new_id();
+            task.type = TASK_TYPE_SLOTS_DATA;
+            task.target_id = -1;

-                slots.push_back(slot_data);
-            }
-            res.set_content(slots.dump(), "application/json");
+            llama.queue_results.add_waiting_task_id(task.id);
+            llama.queue_tasks.post(task);
+
+            // get the result
+            task_result result = llama.queue_results.recv(task.id);
+            llama.queue_results.remove_waiting_task_id(task.id);
+
+            res.set_content(result.result_json["slots"].dump(), "application/json");
            res.status = 200; // HTTP OK
        });
    }
@@ -2713,6 +2762,11 @@ int main(int argc, char **argv)
        LOG_INFO("model loaded", {});
    }

+    if (sparams.chat_template.empty()) { // custom chat template is not supplied
+        // check if the template that comes with the model is supported by us
+        llama.validate_model_chat_template(sparams);
+    }
+
    // Middleware for API key validation
    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
        // If API key is not set, skip validation
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -49,7 +49,8 @@ enum server_state {
 enum task_type {
    TASK_TYPE_COMPLETION,
    TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE
+    TASK_TYPE_NEXT_RESPONSE,
+    TASK_TYPE_SLOTS_DATA // request slot status via the task queue; the result JSON is read for "idle", "processing" and "slots"
 };

 struct task_server {
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -291,7 +291,7 @@ static struct ggml_tensor * llama_build_train_graphs(
    };

    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, N);
    ggml_set_input(KQ_pos);

    // rope has so much parameters that we make a custom function for it
@@ -419,7 +419,7 @@ static struct ggml_tensor * llama_build_train_graphs(
            ggml_gallocr_alloc_graph(alloc, gb);

            if (!measure_only) {
-                int * data = (int *) KQ_pos->data;
+                float * data = (float *) KQ_pos->data;
                for (int i = 0; i < N; ++i) {
                    data[i] = n_past + i;
                }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -121,11 +125,6 @@

 #endif // defined(GGML_USE_HIPBLAS)

-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
 #define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

 #define CC_PASCAL     600
@@ -6041,7 +6040,7 @@ static __device__ void rope_yarn(
 // rope == RoPE == rotary positional embedding
 template<typename T, bool has_pos>
 static __global__ void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    const T * x, T * dst, int ncols, const float * pos, float freq_scale, int p_delta_rows, float freq_base,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims
 ) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@@ -6054,7 +6053,7 @@ static __global__ void rope(
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

-    const int p = has_pos ? pos[i2] : 0;
+    const float p = has_pos ? pos[i2] : 0.0f;
    const float theta_base = p*powf(freq_base, -float(col)/ncols);

    float cos_theta, sin_theta;
@@ -6069,7 +6068,7 @@ static __global__ void rope(

 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, const float * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
@@ -6096,7 +6095,7 @@ static __global__ void rope_neox(

    float cur_rot = inv_ndims * ic - ib;

-    const int p = has_pos ? pos[i2] : 0;
+    const float p = has_pos ? pos[i2] : 0.0f;
    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

    float cos_theta, sin_theta;
@@ -6110,7 +6109,7 @@ static __global__ void rope_neox(
 }

 static __global__ void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    const float * x, float * dst, int ncols, const float * pos, float freq_scale, int p_delta_rows, float freq_base,
    int n_ctx
 ) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
@@ -6125,10 +6124,10 @@ static __global__ void rope_glm_f32(
    const int i2 = row/p_delta_rows;

    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;

-    const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale;
+    const float p = pos != nullptr ? pos[i2] : 0.0f;
+
+    const float theta = min(p, (float) n_ctx - 2)*freq_scale*col_theta_scale;
    const float sin_theta = sinf(theta);
    const float cos_theta = cosf(theta);

@@ -6138,7 +6137,7 @@ static __global__ void rope_glm_f32(
    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale;
+    const float block_theta = max(p - n_ctx - 2, 0.0f)*col_theta_scale;
    const float sin_block_theta = sinf(block_theta);
    const float cos_block_theta = cosf(block_theta);

@@ -7689,7 +7688,7 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const

 template<typename T>
 static void rope_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int nrows, const float * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
    GGML_ASSERT(ncols % 2 == 0);
@@ -7709,7 +7708,7 @@ static void rope_cuda(

 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const float * pos, float freq_scale, int p_delta_rows,
    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
    GGML_ASSERT(ncols % 2 == 0);
@@ -7734,7 +7733,7 @@ static void rope_neox_cuda(
 }

 static void rope_glm_f32_cuda(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const float * x, float * dst, int ncols, int nrows, const float * pos, float freq_scale, int p_delta_rows,
    float freq_base, int n_ctx, cudaStream_t stream
 ) {
    GGML_ASSERT(ncols % 4 == 0);
@@ -9036,11 +9035,11 @@ static void ggml_cuda_op_rope(
    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));

-    const int32_t * pos = nullptr;
+    const float * pos = nullptr;
    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);
        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
+        pos = (const float *) src1_dd;
    }

    const bool is_neox = mode & 2;
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -53,11 +53,23 @@ extern "C" {
 //
 #include <arm_neon.h>

-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { // FP16 storage value -> float, going through the native __fp16 type
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t)); // copy the raw bytes of h into tmp; no numeric conversion happens here
+    return (float)tmp; // the cast performs the actual __fp16 -> float conversion
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { // float -> FP16 storage value, going through the native __fp16 type
+    ggml_fp16_t res;
+    __fp16 tmp = f; // implicit narrowing conversion from float to __fp16
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t)); // copy the raw bytes of tmp into the storage type
+    return res;
+}

 #else

@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
+#if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 }

 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif

+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif

 #define GGML_HASHTABLE_FULL ((size_t)-1)
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2057,7 +2057,13 @@ static bool ggml_metal_graph_compute(
                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

-                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                        float freq_base;
+                        float freq_scale;
+                        float ext_factor;
+                        float attn_factor;
+                        float beta_fast;
+                        float beta_slow;
+
                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1674,7 +1674,7 @@ static void rope_yarn_corr_dims(

 typedef void (rope_t)(
        device const    void * src0,
-        device const int32_t * src1,
+        device const   float * src1,
        device         float * dst,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
@@ -1709,7 +1709,7 @@ typedef void (rope_t)(
 template<typename T>
 kernel void kernel_rope(
        device const    void * src0,
-        device const int32_t * src1,
+        device const   float * src1,
        device         float * dst,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
@@ -1749,11 +1749,11 @@ kernel void kernel_rope(
    float corr_dims[2];
    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);

-    device const int32_t * pos = src1;
+    device const float * pos = src1;

-    const int64_t p = pos[i2];
+    const float p = pos[i2];

-    const float theta_0 = (float)p;
+    const float theta_0 = p;
    const float inv_ndims = -1.f/n_dims;

    if (!is_neox) {
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -438,6 +438,30 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    return res;
 }

+// fallback for vqtbl1q_s8 - per-lane table lookup: res[i] = a[b[i]]; assumes every b[i] < 16 (no range check) - NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
 #else

 #define ggml_int16x8x2_t  int16x8x2_t
@@ -451,6 +475,7 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define ggml_vld1q_u8_x4  vld1q_u8_x4
 #define ggml_vld1q_s8_x2  vld1q_s8_x2
 #define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8

 #endif

@@ -5629,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * restrict q2 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
@@ -5779,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d    =  y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        const uint8_t * restrict q2 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
@@ -6433,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

        const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
        q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6635,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);

        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);

@@ -7138,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
        aux16[1] = (a[0] >> 4) & 0x0f0f;

        const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;

-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);

        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

@@ -7798,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const int8_t * sc = x[i].scales;

        const uint8_t * restrict q5 = x[i].qs;
@@ -7940,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const int8_t * sc = x[i].scales;

        const uint8_t * restrict q5 = x[i].qs;
@@ -8508,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);

        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
@@ -8679,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

    for (int i = 0; i < nb; ++i) {

-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);

        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
@@ -9333,7 +9358,7 @@ void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const
    uint16_t gindex[8];
    uint16x8x2_t vindex;
    int8x16x4_t q1b;
-    int8x16x4_t q8b;
+    ggml_int8x16x4_t q8b;
    uint16x8x4_t scales;
    int32x4x2_t sumi;
    int32x4x2_t dotq;
@@ -9498,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
    float sumf = 0;

    for (int ib = 0; ib < nb; ib += 2) {
-
        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
@@ -9506,16 +9530,17 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
        q8b.val[2]    = vld1q_s8(y[ib+1].qs);
        q8b.val[3]    = vld1q_s8(y[ib+1].qs + 16);

-        q4b.val[0] = vqtbl1q_s8(values, vandq_u8(q4bits.val[0], m4b));
-        q4b.val[1] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
-        q4b.val[2] = vqtbl1q_s8(values, vandq_u8(q4bits.val[1], m4b));
-        q4b.val[3] = vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8  (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));

        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
-
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
    }

    *s = sumf;
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -14642,7 +14642,8 @@ GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_bu
 static ggml_backend_buffer_t
 ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                           size_t size) try {
-    int device = (int) (intptr_t) buft->context;
+    ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+    int device = (int) buft_ctx->device;

    ggml_sycl_set_device(device);
    int device_index = get_device_index_by_id(device);
@@ -14720,7 +14721,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
            ggml_backend_sycl_buffer_types[i] = {
                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
-                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+                /* .context  = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)},
            };
        }
        ggml_backend_sycl_buffer_type_initialized = true;
@@ -14782,10 +14783,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {

 // backend

-struct ggml_backend_context_sycl {
-    int device;
-};
-
 static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
    return GGML_SYCL_NAME;

@@ -14793,14 +14790,14 @@ static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
 }

 static void ggml_backend_sycl_free(ggml_backend_t backend) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

    delete sycl_ctx;
    delete backend;
 }

 static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

    return ggml_backend_sycl_buffer_type(sycl_ctx->device);
 }
@@ -14809,7 +14806,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                               ggml_tensor *tensor,
                                               const void *data, size_t offset,
                                               size_t size) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -14827,7 +14824,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
                                               const ggml_tensor *tensor,
                                               void *data, size_t offset,
                                               size_t size) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

    GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -14842,7 +14839,7 @@ catch (sycl::exception const &exc) {
 }

 static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));

@@ -14878,7 +14875,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
 }

 static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
+    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

    ggml_sycl_set_main_device(sycl_ctx->device);

@@ -15092,8 +15089,9 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
    // not strictly necessary, but it may reduce the overhead of the first graph_compute
    ggml_sycl_set_main_device(device);

-    ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
-        /* .device = */ device
+    ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context {
+        /* .device = */ device,
+        /* .name   = */ GGML_SYCL_NAME + std::to_string(device),
    };

    ggml_backend_t sycl_backend = new ggml_backend {
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -315,13 +315,7 @@
 extern "C" {
 #endif

-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
    typedef uint16_t ggml_fp16_t;
-#endif

    // convert FP16 <-> FP32
    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -111,6 +111,7 @@ class MODEL_ARCH(IntEnum):
    ORION      = auto()
    INTERNLM2  = auto()
    MINICPM    = auto()
+    GEMMA      = auto()


 class MODEL_TENSOR(IntEnum):
@@ -167,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.ORION:          "orion",
    MODEL_ARCH.INTERNLM2:      "internlm2",
    MODEL_ARCH.MINICPM:        "minicpm",
+    MODEL_ARCH.GEMMA:          "gemma",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -511,6 +513,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
+    MODEL_ARCH.GEMMA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_NORM,
+    ],
    # TODO
 }

--- a/llama.cpp
+++ b/llama.cpp
@@ -208,6 +208,7 @@ enum llm_arch {
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
+    LLM_ARCH_GEMMA,
    LLM_ARCH_UNKNOWN,
 };

@@ -234,6 +235,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_ORION,           "orion"      },
    { LLM_ARCH_INTERNLM2,       "internlm2"  },
    { LLM_ARCH_MINICPM,         "minicpm"    },
+    { LLM_ARCH_GEMMA,           "gemma"      },
 };

 enum llm_kv {
@@ -507,7 +509,6 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
        {
            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
-            { LLM_TENSOR_OUTPUT,          "output" },
            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -760,6 +761,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
        },
    },
+    {
+        LLM_ARCH_GEMMA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -1682,8 +1699,8 @@ struct llama_layer {
 };

 struct llama_kv_cell {
-    llama_pos pos   = -1;
-    llama_pos delta = 0;
+    float pos   = -1.0f;
+    float delta =  0.0f;

    std::set<llama_seq_id> seq_id;

@@ -1922,10 +1939,10 @@ struct llama_context {
    ggml_context * ctx_input = nullptr;
    struct ggml_tensor * inp_tokens;    // I32 [n_batch]
    struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
-    struct ggml_tensor * inp_pos;       // I32 [n_batch]
+    struct ggml_tensor * inp_pos;       // F32 [n_batch]
    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
    struct ggml_tensor * inp_KQ_pos;    // F32 [n_ctx]
-    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
+    struct ggml_tensor * inp_K_shift;   // F32 [n_ctx]
    struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
    struct ggml_tensor * inp_cls;       // I32 [n_batch]

@@ -2205,7 +2222,7 @@ static void llama_kv_cache_seq_div(
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
-                          int   d) {
+                        float   d) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

@@ -2773,13 +2790,7 @@ struct llama_model_loader {

        std::vector<no_init<uint8_t>> read_buf;

-        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-            if (!cur) {
-                // some tensors may be allocated in a different context
-                continue;
-            }
-
+        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            if (progress_callback) {
                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                    return false;
@@ -3243,6 +3254,16 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_2B; break;
+                    case 28: model.type = e_model::MODEL_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+               }
+            } break;
        default: (void)0;
    }

@@ -3694,7 +3715,7 @@ static bool llm_load_tensors(
    }

    // create one context per buffer type
-    size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+    size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    for (auto & it : buft_layer_count) {
        struct ggml_init_params params = {
@@ -3832,6 +3853,7 @@ static bool llm_load_tensors(
                        } else {
                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
                        }
                    }

@@ -4031,7 +4053,12 @@ static bool llm_load_tensors(
                    // output
                    {
                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
+
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                    }

                    for (int i = 0; i < n_layer; ++i) {
@@ -4040,14 +4067,23 @@ static bool llm_load_tensors(

                        auto & layer = model.layers[i];

-                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, false);

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);

-                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, false);
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, false);
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, false);
+
+                        layer.ffn_up     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_up_b   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff}, false);

                        // AWQ ScaleActivation layer
                        layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
@@ -4360,6 +4396,40 @@ static bool llm_load_tensors(
                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
                    }
                } break;
+            case LLM_ARCH_GEMMA:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+
+                    const int64_t n_ff          = hparams.n_ff;
+                    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                    for (uint32_t i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                    }
+                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
@@ -5858,9 +5928,10 @@ struct llm_build_context {

        // get input vectors with right size
        const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
-        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+
+        struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos,  n_tokens, 0);
        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
-        struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
+        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls,  n_tokens, 0);

        // construct input embeddings (token, type, position)
        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5868,8 +5939,9 @@ struct llm_build_context {
        // token types are hardcoded to zero ("Sentence A")
        struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
        inpL = ggml_add(ctx0, inpL, type_row0);
+
        if (model.arch == LLM_ARCH_BERT) {
-            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
+            inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, ggml_cast(ctx0, inp_pos, GGML_TYPE_I32)), inpL);
        }
        cb(inpL, "inp_embd", -1);

@@ -6114,7 +6186,7 @@ struct llm_build_context {

            attn_norm = llm_build_norm(ctx0, inpL, hparams,
                    model.layers[il].attn_norm,
-                    NULL,
+                    model.layers[il].attn_norm_b,
                    LLM_NORM, cb, il);
            cb(attn_norm, "attn_norm", il);

@@ -6125,6 +6197,11 @@ struct llm_build_context {
                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);

+                if (model.layers[il].bqkv){
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                if (hparams.f_clamp_kqv > 0.0f) {
                    cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
                    cb(cur, "wqkv_clamped", il);
@@ -6141,7 +6218,7 @@ struct llm_build_context {
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@@ -6154,13 +6231,13 @@ struct llm_build_context {
            {
                cur = llm_build_norm(ctx0, ffn_inp, hparams,
                        model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                        LLM_NORM, cb, il);
                cb(cur, "ffn_norm", il);
                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                        NULL,                      NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                        model.layers[il].ffn_act,
                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                cb(cur, "ffn_out", il);
@@ -6177,7 +6254,7 @@ struct llm_build_context {

        cur = llm_build_norm(ctx0, cur, hparams,
                model.output_norm,
-                NULL,
+                model.output_norm_b,
                LLM_NORM, cb, -1);
        cb(cur, "result_norm", -1);

@@ -7366,6 +7443,116 @@ struct llm_build_context {

        return gf;
    }
+
+    struct ggml_cgraph * build_gemma() { // builds the compute graph used when model.arch is LLM_ARCH_GEMMA
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); // NOTE(review): presumably mirrors the reference Gemma embedding scaling - confirm
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions (view of the first n_tokens entries of lctx.inp_pos)
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm (RMS norm with the per-layer attn_norm weight, no bias tensor)
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q, K and V; RoPE is applied to Q and K only
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head,    n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); // Q is scaled here, so llm_build_kv below receives 1.0f as its scale argument
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_custom(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                        n_embd_head_k, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL, // NULL: no bias for wo (another call site in this file passes layers[il].bo in this slot)
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); // attention output + layer input
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network (ffn_up + ffn_gate + ffn_down weights, every bias slot NULL, LLM_FFN_GELU / LLM_FFN_PAR)
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out); // FFN output + sa_out
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head - for this arch model.output is created from the token_embd weight (LLM_ARCH_GEMMA case in llm_load_tensors)
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 static struct ggml_cgraph * llama_build_graph(
@@ -7474,6 +7661,10 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_minicpm();
            } break;
+        case LLM_ARCH_GEMMA:
+            {
+                result = llm.build_gemma();
+            } break;
        default:
            GGML_ASSERT(false);
    }
@@ -7555,7 +7746,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {

        assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));

-        int32_t * data = (int32_t *) lctx.inp_K_shift->data;
+        float * data = (float *) lctx.inp_K_shift->data;

        for (int i = 0; i < n_ctx; ++i) {
            data[i] = lctx.kv_self.cells[i].delta;
@@ -10311,7 +10502,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        return std::make_pair(i_layer, n_layer);
    };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
        int nx = tensor->ne[0];
        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
@@ -11498,10 +11692,10 @@ struct llama_context * llama_new_context_with_model(

            ctx->inp_tokens  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
-            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
+            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
-            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
+            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

@@ -11854,7 +12048,7 @@ void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, l
    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }

-void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, float d) {
    if (d == 1) {
        return;
    }
@@ -12004,18 +12198,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
        data_ctx->write(&kv_used,     sizeof(kv_used));

        if (kv_buf_size) {
-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            std::vector<uint8_t> tmp_buf;
            for (int il = 0; il < (int) n_layer; ++il) {
-                tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+                tmp_buf.resize(k_size);
                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
                data_ctx->write(tmp_buf.data(), tmp_buf.size());

                // v is not contiguous, copy row by row
-                tmp_buf.resize(elt_size*kv_head);
+                size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+                tmp_buf.resize(v_row_size);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
                    data_ctx->write(tmp_buf.data(), tmp_buf.size());
                }
            }
@@ -12117,17 +12312,16 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
        if (kv_buf_size) {
            GGML_ASSERT(kv_self.total_size() == kv_buf_size);

-            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
-
            for (int il = 0; il < (int) n_layer; ++il) {
-                size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+                size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
                ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
                inp += k_size;

                // v is not contiguous, copy row by row
-                size_t v_row_size = elt_size*kv_head;
+                size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+                size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
                for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
-                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                    ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                    inp += v_row_size;
                }
            }
@@ -12269,7 +12463,7 @@ int llama_eval_embd(
                         int32_t   n_past) {
    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, (float) n_past, 1, 0, };

    const int ret = llama_decode_internal(*ctx, batch);
    if (ret < 0) {
@@ -12589,6 +12783,37 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
+    } else if (tmpl.find("bos_token + message['role']") != std::string::npos) {
+        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
+        for (auto message : chat) {
+            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
+            ss << bos << message->role << "\n" << message->content << "</s>\n";
+        }
+        if (add_ass) {
+            ss << "<s>assistant\n";
+        }
+    } else if (tmpl.find("<start_of_turn>") != std::string::npos) {
+        // google/gemma-7b-it
+        std::string system_prompt = "";
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                // gemma has no system role: stash the system message here and prepend it to the next non-model turn
+                system_prompt = trim(message->content);
+                continue;
+            }
+            // in gemma, "assistant" is "model"
+            role = role == "assistant" ? "model" : message->role;
+            ss << "<start_of_turn>" << role << "\n";
+            if (!system_prompt.empty() && role != "model") {
+                ss << system_prompt << "\n\n";
+                system_prompt = "";
+            }
+            ss << trim(message->content) << "<end_of_turn>\n";
+        }
+        if (add_ass) {
+            ss << "<start_of_turn>model\n";
+        }
    } else {
        // template not supported
        return -1;
--- a/llama.h
+++ b/llama.h
@@ -54,7 +54,7 @@ extern "C" {
    struct llama_model;
    struct llama_context;

-    typedef int32_t llama_pos;
+    typedef float   llama_pos;
    typedef int32_t llama_token;
    typedef int32_t llama_seq_id;

@@ -531,7 +531,7 @@ extern "C" {
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
-                             int   d);
+                           float   d);

    //
    // State / sessions
@@ -708,7 +708,7 @@ extern "C" {

    /// Apply chat template. Inspired by hf apply_chat_template() on python.
    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
+    /// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
    /// @param chat Pointer to a list of multiple llama_chat_message
    /// @param n_msg Number of llama_chat_message in this chat
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-818eeb8a3be99125746a90ec63af8f51516a2ec6
+8cdf783f288a98eddf521b0ab1b4d405be9e18ba
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1134,14 +1134,15 @@ struct test_rope : public test_case {

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[2]);
+        ggml_set_name(pos, "pos");
        ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
        return out;
    }

    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
+            if (strcmp(ggml_get_name(t), "pos") == 0) {
                // pos
                std::vector<int> data(ne[2]);
                for (int i = 0; i < ne[2]; i++) {
@@ -1703,7 +1704,7 @@ struct test_llama : public test_llm {
        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);

        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_tokens);

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
@@ -1825,7 +1826,7 @@ struct test_falcon : public test_llm {
        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);

        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_tokens);

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -27,12 +27,24 @@ int main(void) {
        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
        // bofenghuang/vigogne-2-70b-chat
        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+        // mlabonne/AlphaMonarch-7B
+        "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
+        // google/gemma-7b-it
+        "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
    };
-    std::vector<std::string> expected_substr = {
-        "<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
-        "[/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
-        "</s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
-        "[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+    std::vector<std::string> expected_output = {
+        // teknium/OpenHermes-2.5-Mistral-7B
+        "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
+        // mistralai/Mistral-7B-Instruct-v0.2
+        "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
+        // TheBloke/FusionNet_34Bx2_MoE-AWQ
+        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
+        // bofenghuang/vigogne-2-70b-chat
+        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+        // mlabonne/AlphaMonarch-7B
+        "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
+        // google/gemma-7b-it
+        "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
    };
    std::vector<char> formatted_chat(1024);
    int32_t res;
@@ -43,7 +55,7 @@ int main(void) {

    for (size_t i = 0; i < templates.size(); i++) {
        std::string custom_template = templates[i];
-        std::string substr = expected_substr[i];
+        std::string expected = expected_output[i];
        formatted_chat.resize(1024);
        res = llama_chat_apply_template(
            nullptr,
@@ -57,8 +69,7 @@ int main(void) {
        formatted_chat.resize(res);
        std::string output(formatted_chat.data(), formatted_chat.size());
        std::cout << output << "\n-------------------------\n";
-        // expect the "formatted_chat" to contain pre-defined strings
-        assert(output.find(substr) != std::string::npos);
+        assert(output == expected);
    }
    return 0;
 }
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -1449,9 +1449,9 @@ int main(int argc, const char ** argv) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
+                            ((float *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);
@@ -1489,9 +1489,9 @@ int main(int argc, const char ** argv) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

-                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
+                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
-                            ((int32_t *) p->data)[i] = n_past + i;
+                            ((float *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -143,10 +143,10 @@ int main(int argc, char * argv[]) {
            continue;
        }

-        printf("Testing %s\n", ggml_type_name((ggml_type) i));
-        ggml_quantize_init(ei);
+        if (qfns.from_float && qfns.to_float && qfns.vec_dot) {
+            printf("Testing %s\n", ggml_type_name((ggml_type) i));
+            ggml_quantize_init(ei);

-        if (qfns.from_float && qfns.to_float) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
            const float max_quantization_error =
                type == GGML_TYPE_Q2_K    ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -275,7 +275,7 @@ int main(int argc, char * argv[]) {
            continue;
        }

-        if (qfns.from_float && qfns.to_float) {
+        if (qfns.from_float && qfns.to_float && qfns.vec_dot) {
            printf("%s\n", ggml_type_name(type));

            ggml_quantize_init(type);
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -146,14 +146,14 @@ int main(int /*argc*/, const char ** /*argv*/) {
        const int n_past_0 = 100;
        const int n_past_2 = 33;

-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
+        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne[2]);
+        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne[2]);
+        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, ne[2]);

        for (int i = 0; i < ne[2]; ++i) {
-            ((int32_t *) p0->data)[i] = n_past_0 + i;
-            ((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
-            ((int32_t *) p2->data)[i] = n_past_2 + i;
+            ((float *) p0->data)[i] = n_past_0 + i;
+            ((float *) p1->data)[i] = n_past_2 - n_past_0;
+            ((float *) p2->data)[i] = n_past_2 + i;
        }

        // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
Author	SHA1	Message	Date
Georgi Gerganov	608f449880	swift : fix build ggml-ci	2024-02-23 19:02:09 +02:00
Georgi Gerganov	fff1e8a54a	batched.swift : fix build ggml-ci	2024-02-23 16:15:37 +02:00
Georgi Gerganov	8772658b11	ggml : add I32 <-> F32 conversion ggml-ci	2024-02-23 14:25:05 +02:00
Georgi Gerganov	fc775366f1	llama : switch to floating-point token positions ggml-ci	2024-02-23 12:34:16 +02:00
Jared Van Bortel	15499eb942	mpt : do not duplicate token_embd.weight on disk (#5670 )	2024-02-22 17:05:23 -05:00
Georgi Gerganov	96633eeca1	gemma : use more bits for the token_embd.weight tensor (#5650 ) * gemma : use Q8_0 for the token_embd.weight tensor * llama : quantize token_embd.weight using output type	2024-02-22 23:23:46 +02:00
Georgi Gerganov	847eedbdb2	py : add Gemma conversion from HF models (#5647 ) * py : add gemma conversion from HF models * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela <akx@iki.fi> * Update convert-hf-to-gguf.py Co-authored-by: Aarni Koskela <akx@iki.fi> * Update convert-hf-to-gguf.py Co-authored-by: Jared Van Bortel <jared@nomic.ai> --------- Co-authored-by: Aarni Koskela <akx@iki.fi> Co-authored-by: Jared Van Bortel <jared@nomic.ai>	2024-02-22 23:22:48 +02:00
Georgi Gerganov	7e4f339c40	ggml : always define ggml_fp16_t as uint16_t (#5666 ) * ggml : always define ggml_fp16_t as uint16_t ggml-ci * ggml : cont ggml-ci * ggml : cont * ggml : cont ggml-ci * ggml : cont ggml-ci * cuda : no longer ggml headers last ggml-ci * ggml : fix q6_K FP16 -> FP32 conversion ggml-ci * ggml : more FP16 -> FP32 conversion fixes ggml-ci	2024-02-22 23:21:39 +02:00
Georgi Gerganov	334f76fa38	sync : ggml	2024-02-22 23:21:05 +02:00
Georgi Gerganov	efd56b1c21	ggml : 32-bit arm compat (whisper/1891) * ggml : 32-bit arm compat * ggml : add ggml_vqtbl1q_s8 impl * ggml : cont	2024-02-22 23:20:50 +02:00
Someone	201294ae17	nix: init singularity and docker images (#5056 ) Exposes a few attributes demonstrating how to build [singularity](https://docs.sylabs.io/guides/latest/user-guide/)/[apptainer](https://apptainer.org/) and Docker images re-using llama.cpp's Nix expression. Built locally on `x86_64-linux` with `nix build github:someoneserge/llama.cpp/feat/nix/images#llamaPackages.{docker,docker-min,sif,llama-cpp}` and it's fast and effective.	2024-02-22 11:44:10 -08:00
Georgi Gerganov	5a9e2f60ba	py : minor fixes (#5668 )	2024-02-22 20:13:25 +02:00
Xuan Son Nguyen	373ee3fbba	Add Gemma chat template (#5665 ) * add gemma chat template * gemma: only apply system_prompt on non-model message	2024-02-22 19:10:21 +01:00
Someone	4cb4d8b22d	workflows: nix: hardcode cachix ids, build unconditionally (#5663 ) GitHub does not expose environment and repository variables to PRs coming from forks implies that we've been disabling the Nix CI actions for most PRs. The `if:` also didn't make much sense, because we can always pull from cachix, and there's no point (albeit no risk either) in pushing cache for the untrusted code.	2024-02-22 08:32:09 -08:00
Georgi Gerganov	3a03541ced	minor : fix trailing whitespace (#5638 )	2024-02-22 13:54:03 +02:00
Georgi Gerganov	56d03d92be	readme : update hot topics	2024-02-22 10:35:54 +02:00
Xuan Son Nguyen	a46f50747b	server : fallback to chatml, add AlphaMonarch chat template (#5628 ) * server: fallback to chatml * add new chat template * server: add AlphaMonarch to test chat template * server: only check model template if there is no custom tmpl * remove TODO	2024-02-22 10:33:24 +02:00
Alexey Parfenov	c5688c6250	server : clarify some params in the docs (#5640 )	2024-02-22 10:27:32 +02:00
Dat Quoc Nguyen	4ef245a92a	mpt : add optional bias tensors (#5638 ) Update for MPT with optional bias parameters: to work with PhoGPT and SEA-LION models that were pre-trained with 'bias'.	2024-02-22 10:15:13 +02:00
slaren	973053d8b0	llama : fix loading models with shared tok_embd and output (#5651 ) ggml-ci	2024-02-22 00:42:09 +01:00
Xuan Son Nguyen	7c8bcc11dc	Add docs for llama_chat_apply_template (#5645 ) * add docs for llama_chat_apply_template * fix typo	2024-02-22 00:31:00 +01:00
slaren	7fe4678b02	llama : fix session save/load with quantized KV (#5649 )	2024-02-21 22:52:39 +01:00
slaren	ba2135ccae	gemma : allow offloading the output tensor (#5646 )	2024-02-21 22:18:23 +01:00
Jared Van Bortel	89febfed93	examples : do not assume BOS when shifting context (#5622 )	2024-02-21 10:33:54 -05:00
Georgi Gerganov	5022cf242d	sync : ggml	2024-02-21 16:52:52 +02:00
Pierrick Hymbert	1ecea255eb	server: health: fix race condition on slots data using tasks queue (#5634 ) * server: health: fix race condition on slots data using tasks queue * server: health: * include_slots only if slots_endpoint * fix compile warning task.target_id not initialized.	2024-02-21 15:47:48 +01:00
Ettore Di Giacinto	a00a35cef9	readme : add LocalAI to the availables UI (#5629 )	2024-02-21 16:39:10 +02:00
Georgi Gerganov	eccd7a26dd	sync : ggml (#5633 ) * ggml : fix conv_2d batch mode (ggml/737) Co-authored-by: bssrdf <bssrdf@gmail.com> * ggml : compute forward no longer pass src tensors (ggml/729) * sync : ggml ggml-ci --------- Co-authored-by: bssrdf <merlintiger@hotmail.com> Co-authored-by: bssrdf <bssrdf@gmail.com>	2024-02-21 16:17:10 +02:00
Georgi Gerganov	c14f72db9c	readme : update hot topics	2024-02-21 15:39:54 +02:00
Daniel Bevenius	cc6cac08e3	llava : add --skip-unknown to 1.6 convert.py (#5632 ) This commit adds the `--skip-unknown` option to the convert.py script and removes the saving of the updated checkpoints to avoid updating possibly checked out files. The motivation for this change is that this was done for 1.5 in Commit `fc0c8d286a` ("llava : update surgery script to not remove tensors") and makes the examples more consistent. Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>	2024-02-21 15:36:57 +02:00
postmasters	580111d42b	llama : add `gemma` model (#5631 ) There are couple things in this architecture: 1. Shared input and output embedding parameters. 2. Key length and value length are not derived from `n_embd`. More information about the models can be found at https://ai.google.dev/gemma. GGUFs can be downloaded from https://huggingface.co/google.	2024-02-21 15:08:22 +02:00
Meng, Hengyu	88c46cbdac	[SYCL] conext add name (#5624 ) * [SYCL] conext add name * name should start with SYCL*	2024-02-21 17:52:06 +08:00