ci : enable git lfs for build.yml

Revert "tmp : dummy change to trigger ci"
This reverts commit 97e40df5d6.
2026-02-12 14:03:20 +02:00 · 2024-05-08 10:53:02 +03:00 · 2024-05-08 10:42:25 +03:00 · 2024-05-08 10:42:11 +03:00 · 2024-05-08 10:30:25 +03:00 · 2024-05-08 10:24:53 +03:00
415 changed files with 58075 additions and 149542 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1


-RUN make -j$(nproc)
+RUN make

 ENV LC_ALL=C.utf8

--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -23,13 +23,10 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1

-RUN make -j$(nproc) main
+RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

-RUN apt-get update && \
-    apt-get install -y libgomp1
-
 COPY --from=build /app/main /main

 ENTRYPOINT [ "/main" ]
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make -j$(nproc) main
+RUN make

 ENTRYPOINT [ "/app/main" ]
--- a/.devops/main-vulkan.Dockerfile
+++ b/.devops/main-vulkan.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION as build

 # Install build tools
-RUN apt update && apt install -y git build-essential cmake wget libgomp1
+RUN apt-get update && apt-get install -y git build-essential cmake wget

 # Install Vulkan SDK
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -9,13 +9,10 @@ WORKDIR /app

 COPY . .

-RUN make -j$(nproc) main
+RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

-RUN apt-get update && \
-    apt-get install -y libgomp1
-
 COPY --from=build /app/main /main

 ENV LC_ALL=C.utf8
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -214,6 +214,7 @@ effectiveStdenv.mkDerivation (
        (cmakeBool "LLAMA_CUDA" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
+        (cmakeBool "LLAMA_MPI" useMpi)
        (cmakeBool "LLAMA_VULKAN" useVulkan)
        (cmakeBool "LLAMA_STATIC" enableStatic)
      ]
@@ -226,20 +227,20 @@ effectiveStdenv.mkDerivation (
        )
      ]
      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
-        (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
+        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
+
+        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
+        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
+        # and select the line that matches the current nixpkgs version of rocBLAS.
+        # Should likely use `rocmPackages.clr.gpuTargets`.
+        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      ]
      ++ optionals useMetalKit [
        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
      ];

-    # Environment variables needed for ROCm
-    env = optionals useRocm {
-      ROCM_PATH = "${rocmPackages.clr}";
-      HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
-    };
-
    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -25,12 +25,12 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make -j$(nproc) server
+RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev

 COPY --from=build /app/server /server

--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev

-RUN make -j$(nproc)
+RUN make

 ENTRYPOINT [ "/app/server" ]
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -11,12 +11,12 @@ COPY . .

 ENV LLAMA_CURL=1

-RUN make -j$(nproc) server
+RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev

 COPY --from=build /app/server /server

--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,7 +8,7 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert-hf-to-gguf.py "$@"
+    python3 ./convert.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+models/ggml-vocab-*.gguf filter=lfs diff=lfs merge=lfs -text
--- a/.github/ISSUE_TEMPLATE/01-bug-low.yml
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -1,50 +0,0 @@
-name: Low Severity Bugs
-description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
-title: "Bug: "
-labels: ["bug-unconfirmed", "low severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/02-bug-medium.yml
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -1,50 +0,0 @@
-name: Medium Severity Bug
-description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
-title: "Bug: "
-labels: ["bug-unconfirmed", "medium severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/03-bug-high.yml
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -1,50 +0,0 @@
-name: High Severity Bug
-description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
-title: "Bug: "
-labels: ["bug-unconfirmed", "high severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/04-bug-critical.yml
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -1,50 +0,0 @@
-name: Critical Severity Bug
-description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
-title: "Bug: "
-labels: ["bug-unconfirmed", "critical severity"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for taking the time to fill out this bug report!
-        Please include information about your system, the steps to reproduce the bug,
-        and the version of llama.cpp that you are using.
-        If possible, please provide a minimal code example that reproduces the bug.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What happened?
-      description: Also tell us, what did you expect to happen?
-      placeholder: Tell us what you see!
-    validations:
-      required: true
-  - type: textarea
-    id: version
-    attributes:
-      label: Name and Version
-      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
-      placeholder: |
-        $./main --version
-        version: 2999 (42b4109e)
-        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
-    validations:
-      required: true
-  - type: dropdown
-    id: operating-system
-    attributes:
-      label: What operating system are you seeing the problem on?
-      multiple: true
-      options:
-        - Linux
-        - Mac
-        - Windows
-        - BSD
-        - Other? (Please let us know in description)
-    validations:
-      required: false
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/05-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml
@@ -1,51 +0,0 @@
-name: Enhancement
-description: Used to request enhancements for llama.cpp
-title: "Feature Request: "
-labels: ["enhancement"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
-
-  - type: checkboxes
-    id: prerequisites
-    attributes:
-      label: Prerequisites
-      description: Please confirm the following before submitting your enhancement request.
-      options:
-        - label: I am running the latest code. Mention the version if possible as well.
-          required: true
-        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-          required: true
-        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
-          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
-          required: true
-
-  - type: textarea
-    id: feature-description
-    attributes:
-      label: Feature Description
-      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-      placeholder: Detailed description of the enhancement
-    validations:
-      required: true
-
-  - type: textarea
-    id: motivation
-    attributes:
-      label: Motivation
-      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-      placeholder: Explanation of why this feature is needed and its benefits
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-implementation
-    attributes:
-      label: Possible Implementation
-      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
-      placeholder: Detailed description of potential implementation
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/06-research.yml
+++ b/.github/ISSUE_TEMPLATE/06-research.yml
@@ -1,52 +0,0 @@
-name: Research
-description: Track new technical research area
-title: "Research: "
-labels: ["research 🔬"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
-
-  - type: checkboxes
-    id: research-stage
-    attributes:
-      label: Research Stage
-      description: Track general state of this research ticket
-      options:
-        - label: Background Research (Let's try to avoid reinventing the wheel)
-        - label: Hypothesis Formed (How do you think this will work and it's effect?)
-        - label: Strategy / Implementation Forming
-        - label: Analysis of results
-        - label: Debrief / Documentation (So people in the future can learn from us)
-
-  - type: textarea
-    id: background
-    attributes:
-      label: Previous existing literature and research
-      description: Whats the current state of the art and whats the motivation for this research?
-
-  - type: textarea
-    id: hypothesis
-    attributes:
-      label: Hypothesis
-      description: How do you think this will work and it's effect?
-
-  - type: textarea
-    id: implementation
-    attributes:
-      label: Implementation
-      description: Got an approach? e.g. a PR ready to go?
-
-  - type: textarea
-    id: analysis
-    attributes:
-      label: Analysis
-      description: How does the proposed implementation behave?
-
-  - type: textarea
-    id: logs
-    attributes:
-      label: Relevant log output
-      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
-      render: shell
--- a/.github/ISSUE_TEMPLATE/07-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/07-refactor.yml
@@ -1,28 +0,0 @@
-name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
-title: "Refactor: "
-labels: ["refactor"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
-
-  - type: textarea
-    id: background-description
-    attributes:
-      label: Background Description
-      description: Please provide a detailed written description of the pain points you are trying to solve.
-      placeholder: Detailed description behind your motivation to request refactor
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-approaches
-    attributes:
-      label: Possible Refactor Approaches
-      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
-      placeholder: Your idea of possible refactoring opportunity/approaches
-    validations:
-      required: false
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -0,0 +1,11 @@
+---
+name: Bug template
+about: Used to report bugs in llama.cpp
+labels: ["bug-unconfirmed"]
+assignees: ''
+
+---
+
+Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,13 +0,0 @@
-blank_issues_enabled: true
-contact_links:
-  - name: Got an idea?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
-    about: Pop it there. It may then become an enhancement ticket.
-  - name: Got a question?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
-    about: Ask a question there!
-  - name: Want to contribute?
-    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
-    about: Head to the contribution guide page of the wiki for areas you can help with
-
-
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -0,0 +1,28 @@
+---
+name: Enhancement template
+about: Used to request enhancements for llama.cpp
+labels: ["enhancement"]
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please confirm the following before submitting an enhancement request.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+
+# Feature Description
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+
+# Motivation
+
+Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+
+# Possible Implementation
+
+If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,90 +0,0 @@
-# https://github.com/actions/labeler
-Kompute:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml-kompute.h
-            - ggml-kompute.cpp
-            - README-kompute.md
-Apple Metal:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml-metal.h
-            - ggml-metal.cpp
-            - README-metal.md
-SYCL:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml-sycl.h
-            - ggml-sycl.cpp
-            - README-sycl.md
-Nvidia GPU:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml-cuda.h
-            - ggml-cuda/**
-Vulkan:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml_vk_generate_shaders.py
-            - ggml-vulkan*
-documentation:
-    - changed-files:
-        - any-glob-to-any-file:
-            - docs/**
-            - media/**
-testing:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tests/**
-build:
-    - changed-files:
-        - any-glob-to-any-file:
-            - cmake/**
-            - CMakeLists.txt
-            - CMakePresets.json
-            - codecov.yml
-examples:
-    - changed-files:
-        - any-glob-to-any-file: examples/**
-devops:
-    - changed-files:
-        - any-glob-to-any-file:
-            - .devops/**
-            - .github/**
-            - ci/**
-python:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.py"
-            - requirements/**
-            - gguf-py/**
-            - .flake8
-script:
-    - changed-files:
-        - any-glob-to-any-file:
-            - scripts/**
-android:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/llama.android/**
-server:
-    - changed-files:
-        - any-glob-to-any-file:
-            - examples/server/**
-ggml:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml.c
-            - ggml.h
-            - ggml-*.c
-            - ggml-*.h
-            - ggml-cuda/**
-nix:
-    - changed-files:
-        - any-glob-to-any-file:
-            - "**/*.nix"
-            - .github/workflows/nix-*.yml
-            - .devops/nix/nixpkgs-instances.nix
-embedding:
-    - changed-files:
-        - any-glob-to-any-file: examples/embedding/
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,5 +0,0 @@
- Self Reported Review Complexity:
-    - [ ] Review Complexity : Low
-    - [ ] Review Complexity : Medium
-    - [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -13,7 +13,7 @@ on:
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -33,6 +33,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Dependencies
@@ -91,6 +92,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Dependencies
@@ -153,6 +155,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -188,6 +192,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -211,6 +217,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Dependencies
@@ -271,73 +278,71 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
          name: llama-bin-ubuntu-x64.zip

-  ubuntu-latest-cmake-sanitizer:
+#  ubuntu-latest-cmake-sanitizer:
+#    runs-on: ubuntu-latest
+#
+#    continue-on-error: true
+#
+#    strategy:
+#      matrix:
+#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+#        build_type: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v4
+#        with:
+#          lfs: true
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          mkdir build
+#          cd build
+#          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+#          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+#
+#      - name: Test
+#        id: cmake_test
+#        run: |
+#          cd build
+#          ctest -L main --verbose --timeout 900
+
+  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest

    continue-on-error: true

    strategy:
      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        mpi_library: [mpich, libopenmpi-dev]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Build (no OpenMP)
-        id: cmake_build_no_openmp
-        if: ${{ matrix.sanitizer == 'THREAD' }}
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential ${{ matrix.mpi_library }}

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake -DLLAMA_RPC=ON ..
+          cmake -DLLAMA_MPI=ON ..
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -353,6 +358,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -368,33 +375,6 @@ jobs:
          cmake -DLLAMA_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)

-  ubuntu-22-cmake-hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.0.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Build with legacy HIP support
-        id: cmake_build_legacy_hip
-        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
-          cmake --build build2 --config Release -j $(nproc)
-
  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04

@@ -402,6 +382,8 @@ jobs:

    steps:
      - uses: actions/checkout@v2
+        with:
+          lfs: true

      - name: add oneAPI to apt
        shell: bash
@@ -426,6 +408,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Build
        id: cmake_build
@@ -443,6 +427,8 @@ jobs:

    steps:
      - uses: actions/checkout@v2
+        with:
+          lfs: true

      - name: add oneAPI to apt
        shell: bash
@@ -467,6 +453,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Build
        id: cmake_build
@@ -487,6 +475,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -518,6 +508,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -547,6 +539,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -576,6 +570,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -609,6 +605,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -639,6 +637,8 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
@@ -684,53 +684,75 @@ jobs:
            cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows-latest-cmake:
-    runs-on: windows-2019
+    runs-on: windows-latest

    env:
      OPENBLAS_VERSION: 0.3.23
+      OPENCL_VERSION: 2023.04.17
+      CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1

    strategy:
      matrix:
        include:
-          - build: 'rpc-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'noavx-x64'
+          - build: 'noavx'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2-x64'
+          - build: 'avx2'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx-x64'
+          - build: 'avx'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512-x64'
+          - build: 'avx512'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'openblas-x64'
+          - build: 'clblast'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+          - build: 'openblas'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute-x64'
+          - build: 'kompute'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'vulkan-x64'
+          - build: 'vulkan'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'arm64'
+            defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Clone Kompute submodule
        id: clone_kompute
-        if: ${{ matrix.build == 'kompute-x64' }}
+        if: ${{ matrix.build == 'kompute' }}
        run: |
          git submodule update --init kompute

+      - name: Download OpenCL SDK
+        id: get_opencl
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
+          mkdir $env:RUNNER_TEMP/opencl
+          tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
+
+      - name: Download CLBlast
+        id: get_clblast
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
+          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
+          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
+            $txt = Get-Content -Path $f -Raw
+            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+          }
+
      - name: Download OpenBLAS
        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
+        if: ${{ matrix.build == 'openblas' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -743,34 +765,38 @@ jobs:

      - name: Install Vulkan SDK
        id: get_vulkan
-        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
+        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
      - name: Build
        id: cmake_build
        run: |
-          cmake -S . -B build ${{ matrix.defines }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          mkdir build
+          cd build
+          cmake .. ${{ matrix.defines }}
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Add clblast.dll
+        id: add_clblast_dll
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
+          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt

      - name: Add libopenblas.dll
        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
+        if: ${{ matrix.build == 'openblas' }}
        run: |
          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt

      - name: Check AVX512F support
        id: check_avx512f
-        if: ${{ matrix.build == 'avx512-x64' }}
+        if: ${{ matrix.build == 'avx512' }}
        continue-on-error: true
        run: |
          cd build
@@ -784,14 +810,14 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

      - name: Test (Intel SDE)
        id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
@@ -819,17 +845,17 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+          name: llama-bin-win-${{ matrix.build }}-x64.zip

  windows-latest-cmake-cuda:
-    runs-on: windows-2019
+    runs-on: windows-latest

    strategy:
      matrix:
@@ -841,11 +867,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

-      - name: Install CUDA toolkit
+      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.15
        with:
          cuda: ${{ matrix.cuda }}
          method: 'network'
@@ -907,14 +933,15 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Install
@@ -941,17 +968,6 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload artifacts
@@ -961,43 +977,14 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

-  windows-latest-cmake-hip:
-    runs-on: windows-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
-          cmake --build build --config Release
-
  ios-xcode-build:
    runs-on: macos-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -1008,6 +995,8 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Set up JDK
        uses: actions/setup-java@v3
@@ -1030,7 +1019,9 @@ jobs:
 #    runs-on: macos-12
 #    steps:
 #    - name: Clone
-#      uses: actions/checkout@v4
+#      uses: actions/checkout@v4
+#      with:
+#        lfs: true
 #
 #    - name: Build
 #      uses: cross-platform-actions/action@v0.19.0
@@ -1040,7 +1031,7 @@ jobs:
 #        hypervisor: 'qemu'
 #        run: |
 #            sudo pkg update
-#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
+#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
 #            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`

  release:
@@ -1063,6 +1054,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Determine tag name
@@ -1128,6 +1120,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
@@ -1152,6 +1146,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
@@ -1176,6 +1172,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
@@ -1206,6 +1204,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1245,6 +1245,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1291,6 +1293,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -13,14 +13,16 @@ jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 lcov

+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
      - name: Build
        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests

--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,17 +0,0 @@
-name: "Pull Request Labeler"
-on:
- pull_request_target
-
-jobs:
-  labeler:
-    permissions:
-      contents: read
-      pull-requests: write
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v4
-      with:
-        repository: "ggerganov/llama.cpp"
-    - uses: actions/labeler@v5
-      with:
-        configuration-path: '.github/labeler.yml'
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -16,9 +16,11 @@ on:
    branches:
      - master
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-  pull_request:
+  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+  schedule:
+    -  cron: '2 4 * * *'

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -30,8 +32,10 @@ jobs:

    strategy:
      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [RelWithDebInfo]
+        # TODO: temporary disabled due to linux kernel issues
+        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [UNDEFINED]
+        build_type: [Debug]
        include:
          - build_type: Release
            sanitizer: ""
@@ -98,8 +102,10 @@ jobs:
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server

+
      - name: Tests
        id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh
@@ -113,7 +119,7 @@ jobs:


  server-windows:
-    runs-on: windows-2019
+    runs-on: windows-latest

    steps:
      - name: Clone
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -0,0 +1,29 @@
+name: Zig CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+      - uses: goto-bus-stop/setup-zig@v2
+        with:
+          version: 0.11.0
+      - name: Build Summary
+        run: zig build --summary all -freference-trace
--- a/.gitignore
+++ b/.gitignore
@@ -34,11 +34,9 @@ ggml-metal-embed.metal
 lcov-report/
 gcovr-report/

-tags
 build*
 !build.zig
 cmake-build-*
-android-ndk-*
 out/
 tmp/

@@ -107,7 +105,6 @@ examples/jeopardy/results.txt
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
-examples/server/*.css.hpp

 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)

@@ -72,13 +72,11 @@ else()
    set(INS_ENB ON)
 endif()

-option(LLAMA_SVE                             "llama: enable SVE"                                OFF)
 option(LLAMA_AVX                             "llama: enable AVX"                                ${INS_ENB})
 option(LLAMA_AVX2                            "llama: enable AVX2"                               ${INS_ENB})
 option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
 option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
 option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
-option(LLAMA_AVX512_BF16                     "llama: enable AVX512-BF16"                        OFF)
 option(LLAMA_FMA                             "llama: enable FMA"                                ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
@@ -106,11 +104,11 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
 option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
-option(LLAMA_CUDA_FA_ALL_QUANTS              "llama: compile all quants for FlashAttention"     OFF)

 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
+option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
 option(LLAMA_VULKAN_CHECK_RESULTS            "llama: run Vulkan op checks"                      OFF)
 option(LLAMA_VULKAN_DEBUG                    "llama: enable Vulkan debug output"                OFF)
@@ -124,8 +122,8 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
                                             "llama: metal minimum macOS version")
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
-option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
-option(LLAMA_OPENMP                          "llama: use OpenMP"                                ON)
+option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
+option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 set(LLAMA_SYCL_TARGET   "INTEL" CACHE STRING "llama: sycl target device")
@@ -135,8 +133,6 @@ set(LLAMA_SCHED_MAX_COPIES  "4" CACHE STRING "llama: max input copies for pipeli
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
-option(LLAMA_LASX                            "llama: enable lasx"                               ON)
-option(LLAMA_LSX                             "llama: enable lsx"                                ON)

 # add perf arguments
 option(LLAMA_PERF                            "llama: enable perf"                               OFF)
@@ -296,22 +292,11 @@ if (LLAMA_METAL)
        )
 endif()

-if (LLAMA_OPENMP)
-    find_package(OpenMP)
-    if (OpenMP_FOUND)
-        message(STATUS "OpenMP found")
-        add_compile_definitions(GGML_USE_OPENMP)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-    else()
-        message(WARNING "OpenMP not found")
-    endif()
-endif()
-
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
    endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) # bare name is dereferenced by if(); Make-style $(VAR) is taken literally by CMake
        set(BLA_SIZEOF_INTEGER 8)
    endif()

@@ -396,45 +381,30 @@ if (LLAMA_LLAMAFILE)
    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()

+if (LLAMA_QKK_64)
+    add_compile_definitions(GGML_QKK_64)
+endif()
+
 if (LLAMA_CUBLAS)
    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
    set(LLAMA_CUDA ON)
 endif()

 if (LLAMA_CUDA)
-    cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
+    cmake_minimum_required(VERSION 3.17)

    find_package(CUDAToolkit)
    if (CUDAToolkit_FOUND)
        message(STATUS "CUDA found")

-        if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-            # 52 == lowest CUDA 12 standard
-            # 60 == f16 CUDA intrinsics
-            # 61 == integer CUDA intrinsics
-            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-            if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-                set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
-            else()
-                set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
-                #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
-            endif()
-        endif()
-        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
        enable_language(CUDA)

        set(GGML_HEADERS_CUDA ggml-cuda.h)

        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})

        add_compile_definitions(GGML_USE_CUDA)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -457,22 +427,10 @@ if (LLAMA_CUDA)
        if (LLAMA_CUDA_NO_PEER_COPY)
            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
        endif()
-        if (LLAMA_CUDA_FA_ALL_QUANTS)
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-        else()
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        endif()

        if (LLAMA_STATIC)
            if (WIN32)
-                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -486,20 +444,69 @@ if (LLAMA_CUDA)
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
        endif()
+
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+            #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
+        endif()
+    endif()
+    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+
    else()
        message(WARNING "CUDA not found")
    endif()
 endif()

-if (LLAMA_RPC)
-    add_compile_definitions(GGML_USE_RPC)
+if (LLAMA_MPI)
+    cmake_minimum_required(VERSION 3.10)
+    find_package(MPI)
+    if (MPI_C_FOUND)
+        message(STATUS "MPI found")

-    if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+        set(GGML_HEADERS_MPI ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.c)
+
+        add_compile_definitions(GGML_USE_MPI)
+        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
+
+        if (NOT MSVC)
+            add_compile_options(-Wno-cast-qual)
+        endif()
+
+        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
+
+        # Even if you're only using the C header, C++ programs may bring in MPI
+        # C++ functions, so more linkage is needed
+        if (MPI_CXX_FOUND)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}     ${MPI_CXX_LIBRARIES})
+        endif()
+    else()
+        message(WARNING "MPI not found")
    endif()
+endif()

-    set(GGML_HEADERS_RPC ggml-rpc.h)
-    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+if (LLAMA_CLBLAST)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        set(GGML_HEADERS_OPENCL ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
+
+        add_compile_definitions(GGML_USE_CLBLAST)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
 endif()

 if (LLAMA_VULKAN)
@@ -512,12 +519,6 @@ if (LLAMA_VULKAN)

        add_compile_definitions(GGML_USE_VULKAN)

-        # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
-        # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
-        if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-            add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
-        endif()
-
        if (LLAMA_VULKAN_CHECK_RESULTS)
            add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
        endif()
@@ -541,42 +542,16 @@ if (LLAMA_VULKAN)
 endif()

 if (LLAMA_HIPBLAS)
-    if (NOT EXISTS $ENV{ROCM_PATH})
-        if (NOT EXISTS /opt/rocm)
-            set(ROCM_PATH /usr)
-        else()
-            set(ROCM_PATH /opt/rocm)
-        endif()
-    else()
-        set(ROCM_PATH $ENV{ROCM_PATH})
-    endif()
-    list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
-    list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
+    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

-    # CMake on Windows doesn't support the HIP language yet
-    if(WIN32)
-        set(CXX_IS_HIPCC TRUE)
-    else()
-        string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
    endif()

-    if(CXX_IS_HIPCC)
-        if(LINUX)
-            if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-                message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-            endif()
-
-            message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
-                    " Prefer setting the HIP compiler directly. See README for details.")
-        endif()
-    else()
-        # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
-        if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-            set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
-        endif()
-        cmake_minimum_required(VERSION 3.21)
-        enable_language(HIP)
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
    endif()
+
    find_package(hip     REQUIRED)
    find_package(hipblas REQUIRED)
    find_package(rocblas REQUIRED)
@@ -587,10 +562,6 @@ if (LLAMA_HIPBLAS)

    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-    file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
-    list(APPEND GGML_SOURCES_ROCM ${SRCS})

    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

@@ -610,35 +581,17 @@ if (LLAMA_HIPBLAS)
        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
    endif()

-    if (LLAMA_CUDA_FA_ALL_QUANTS)
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-    else()
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-        file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-        list(APPEND GGML_SOURCES_ROCM ${SRCS})
-    endif()
-
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

-    if (CXX_IS_HIPCC)
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
-    else()
-        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
-    endif()
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)

    if (LLAMA_STATIC)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
    endif()

-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()

 if (LLAMA_SYCL)
@@ -661,10 +614,6 @@ if (LLAMA_SYCL)
        add_compile_definitions(GGML_SYCL_F16)
    endif()

-    if (LLAMA_CUDA_FORCE_MMQ)
-        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
-    endif()
-
    add_compile_options(-I./) #include DPCT
    add_compile_options(-I/${SYCL_INCLUDE_DIR})

@@ -780,7 +729,6 @@ if (LLAMA_KOMPUTE)
            kompute-shaders/op_mul_mat_q4_0.comp
            kompute-shaders/op_mul_mat_q4_1.comp
            kompute-shaders/op_mul_mat_q6_k.comp
-            kompute-shaders/op_getrows_f32.comp
            kompute-shaders/op_getrows_f16.comp
            kompute-shaders/op_getrows_q4_0.comp
            kompute-shaders/op_getrows_q4_1.comp
@@ -813,7 +761,6 @@ if (LLAMA_KOMPUTE)
            shaderop_mul_mat_q4_0.h
            shaderop_mul_mat_q4_1.h
            shaderop_mul_mat_q6_k.h
-            shaderop_getrows_f32.h
            shaderop_getrows_f16.h
            shaderop_getrows_q4_0.h
            shaderop_getrows_q4_1.h
@@ -1047,11 +994,6 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif ()
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
-        endif ()
-
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -1080,9 +1022,6 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
-        if (LLAMA_SVE)
-            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
-        endif()
    endif()
 elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1107,10 +1046,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
-            if (LLAMA_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
-            endif()
        elseif (LLAMA_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (LLAMA_AVX)
@@ -1142,9 +1077,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
        if (LLAMA_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
-        if (LLAMA_AVX512_BF16)
-            list(APPEND ARCH_FLAGS -mavx512bf16)
-        endif()
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
@@ -1154,17 +1086,6 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-    message(STATUS "loongarch64 detected")
-
-    list(APPEND ARCH_FLAGS -march=loongarch64)
-    if (LLAMA_LASX)
-        list(APPEND ARCH_FLAGS -mlasx)
-    endif()
-    if (LLAMA_LSX)
-        list(APPEND ARCH_FLAGS -mlsx)
-    endif()
-
 else()
    message(STATUS "Unknown architecture")
 endif()
@@ -1251,8 +1172,9 @@ add_library(ggml OBJECT
            ggml-quants.c
            ggml-quants.h
            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
+            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
@@ -1338,9 +1260,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)

 set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-        "${GGML_HEADERS_CUDA}"
-        "${GGML_HEADERS_METAL}"
-        "${GGML_HEADERS_EXTRA}")
+        "${GGML_HEADERS_CUDA}"  "${GGML_HEADERS_OPENCL}"
+        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)
@@ -1349,7 +1270,18 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert-hf-to-gguf.py
+    FILES convert.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
@@ -1376,13 +1308,6 @@ if (LLAMA_METAL)
    endif()
 endif()

-configure_file(cmake/llama.pc.in
-        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        @ONLY)
-
-install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
-        DESTINATION lib/pkgconfig)
-
 #
 # programs, examples and tests
 #
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,49 +0,0 @@
-{
-  "version": 4,
-  "configurePresets": [
-    {
-        "name":  "base",
-        "hidden": true,
-        "generator":   "Ninja",
-        "binaryDir":   "${sourceDir}/build-${presetName}",
-        "cacheVariables": {
-            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
-        }
-    },
-
-    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
-
-    {
-        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-        }
-    },
-
-    {
-        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
-        }
-    },
-
-    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },
-
-    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] },
-
-    { "name": "x64-windows-msvc-debug"  , "inherits": [ "base", "debug"   ] },
-    { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
-    { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
-  ]
-}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,14 +0,0 @@
-# Contributing Guidelines
-
-## Checklist
-
-* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
-* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
-* Execute [the full CI locally on your machine](ci/README.md) before publishing
-
-## PR formatting
-
-* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-    - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
-* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
-* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
--- a/126
+++ b/126
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search  \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
@@ -57,8 +57,6 @@ ifeq ($(UNAME_S),Darwin)
 		LLAMA_METAL := 1
 	endif

-	LLAMA_NO_OPENMP := 1
-
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)
@@ -69,10 +67,6 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif

-ifdef LLAMA_RPC
-	BUILD_TARGETS += rpc-server
-endif
-
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -141,16 +135,12 @@ MK_NVCCFLAGS = -std=c++11
 ifdef LLAMA_FAST
 MK_CFLAGS     += -Ofast
 HOST_CXXFLAGS += -Ofast
-ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
 else
 MK_CFLAGS     += -O3
 MK_CXXFLAGS   += -O3
-ifndef LLAMA_DEBUG
 MK_NVCCFLAGS  += -O3
-endif # LLAMA_DEBUG
-endif # LLAMA_FAST
+endif

 ifndef LLAMA_NO_CCACHE
 CCACHE := $(shell which ccache)
@@ -211,10 +201,9 @@ ifdef LLAMA_SCHED_MAX_COPIES
 endif

 ifdef LLAMA_DEBUG
-	MK_CFLAGS    += -O0 -g
-	MK_CXXFLAGS  += -O0 -g
-	MK_LDFLAGS   += -g
-	MK_NVCCFLAGS += -O0 -g
+	MK_CFLAGS   += -O0 -g
+	MK_CXXFLAGS += -O0 -g
+	MK_LDFLAGS  += -g

 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -390,16 +379,15 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
 	CUDA_POWER_ARCH = 1
 endif

-ifneq ($(filter loongarch64%,$(UNAME_M)),)
-	MK_CFLAGS   += -mlasx
-	MK_CXXFLAGS += -mlasx
-endif
-
 else
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif

+ifdef LLAMA_QKK_64
+	MK_CPPFLAGS += -DGGML_QKK_64
+endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -411,11 +399,12 @@ ifndef LLAMA_NO_ACCELERATE
 	endif
 endif # LLAMA_NO_ACCELERATE

-ifndef LLAMA_NO_OPENMP
-	MK_CPPFLAGS += -DGGML_USE_OPENMP
-	MK_CFLAGS   += -fopenmp
-	MK_CXXFLAGS += -fopenmp
-endif # LLAMA_NO_OPENMP
+ifdef LLAMA_MPI
+	MK_CPPFLAGS += -DGGML_USE_MPI
+	MK_CFLAGS   += -Wno-cast-qual
+	MK_CXXFLAGS += -Wno-cast-qual
+	OBJS        += ggml-mpi.o
+endif # LLAMA_MPI

 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
@@ -433,37 +422,21 @@ ifdef LLAMA_BLIS
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

-ifdef LLAMA_RPC
-	MK_CPPFLAGS   += -DGGML_USE_RPC
-	OBJS          += ggml-rpc.o
-endif # LLAMA_RPC
-
 ifdef LLAMA_CUBLAS
 # LLAMA_CUBLAS is deprecated and will be removed in the future
 	LLAMA_CUDA := 1
 endif

-OBJS_CUDA_TEMP_INST      = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
-OBJS_CUDA_TEMP_INST     += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
-ifdef LLAMA_CUDA_FA_ALL_QUANTS
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
-else
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
-	OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
-endif # LLAMA_CUDA_FA_ALL_QUANTS
-
 ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-	OBJS         += $(OBJS_CUDA_TEMP_INST)
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -474,9 +447,6 @@ endif # JETSON_EOL_MODULE_DETECT
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
-ifdef LLAMA_CUDA_DEBUG
-	MK_NVCCFLAGS += --device-debug
-endif # LLAMA_CUDA_DEBUG
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
 else
@@ -526,10 +496,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif # LLAMA_CUDA_CCBIN
-ifdef LLAMA_CUDA_FA_ALL_QUANTS
-	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
-endif # LLAMA_CUDA_FA_ALL_QUANTS
+endif

 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
@@ -541,13 +508,30 @@ define NVCC_COMPILE
 endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT

-ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(NVCC_COMPILE)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 endif # LLAMA_CUDA

+ifdef LLAMA_CLBLAST
+	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
+	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
+	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
+
+	# Mac provides OpenCL as a framework
+	ifeq ($(UNAME_S),Darwin)
+		MK_LDFLAGS += -lclblast -framework OpenCL
+	else
+		MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
+	endif
+	OBJS    += ggml-opencl.o
+
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_CLBLAST
+
 ifdef LLAMA_VULKAN
 	MK_CPPFLAGS  += -DGGML_USE_VULKAN
 	MK_LDFLAGS += -lvulkan
@@ -576,10 +560,10 @@ endif # LLAMA_VULKAN
 ifdef LLAMA_HIPBLAS
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH	?= /usr
-		AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
-		AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	HIPCC                   ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X       ?= 32
@@ -590,9 +574,8 @@ ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
+	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@@ -604,12 +587,11 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
 	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-	OBJS        += $(OBJS_CUDA_TEMP_INST)

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

-ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<

 endif # LLAMA_HIPBLAS
@@ -647,26 +629,16 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
 endif
 endif # LLAMA_METAL

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+ifdef LLAMA_MPI
+ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_MPI

 ifndef LLAMA_NO_LLAMAFILE
 sgemm.o: sgemm.cpp sgemm.h ggml.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif

-ifdef LLAMA_RPC
-ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-endif # LLAMA_RPC
-
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -746,9 +718,14 @@ unicode.o: unicode.cpp unicode.h
 unicode-data.o: unicode-data.cpp unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -779,7 +756,6 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 clean:
 	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	rm -vrf ggml-cuda/*.o
-	rm -vrf ggml-cuda/template-instances/*.o
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -848,7 +824,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -898,6 +874,10 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o

 When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.

-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

 ## News

@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 ## OS

-| OS      | Status  | Verified                                       |
-|---------|---------|------------------------------------------------|
-| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
-| Windows | Support | Windows 11                                     |
+| OS      | Status  | Verified                           |
+|---------|---------|------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11                         |


 ## Hardware
@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750               |
+| Intel Arc Series              | Support | Arc 770, 730M                         |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
 | Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

--- a/README.md
+++ b/README.md
@@ -3,8 +3,6 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
-[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -22,8 +20,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -53,6 +50,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
        <li><a href="#quantization">Quantization</a></li>
        <li><a href="#interactive-mode">Interactive mode</a></li>
        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
+        <li><a href="#instruct-mode">Instruct mode</a></li>
        <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
        <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
        <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@@ -76,7 +74,7 @@ variety of hardware - locally and in the cloud.
 - AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan and SYCL backend support
+- Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

 Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -109,6 +107,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
+- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@@ -129,7 +128,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -143,14 +141,11 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)

 **HTTP server**

 [llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

-[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
-
 **Bindings:**

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@@ -181,7 +176,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -204,14 +198,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

-**Tools:**
-
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
-
 ---

 Here is a typical run using LLaMA v2 13B on M2 Ultra:
@@ -311,7 +300,7 @@ cd llama.cpp

 ### Build

-In order to build llama.cpp you have four different options.
+In order to build llama.cpp you have three different options.

 - Using `make`:
  - On Linux or MacOS:
@@ -320,6 +309,8 @@ In order to build llama.cpp you have four different options.
      make
      ```

+      **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
+
  - On Windows:

    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -331,38 +322,40 @@ In order to build llama.cpp you have four different options.
        make
        ```

-  - Notes:
-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, run `make LLAMA_DEBUG=1`
-
 - Using `CMake`:

-  ```bash
-  cmake -B build
-  cmake --build build --config Release
-  ```
+    ```bash
+    cmake -B build
+    cmake --build build --config Release
+    ```

-  **Notes**:
+    **Note**: for `Debug` builds, there are two cases:

-    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
-    - For faster repeated compilation, install [ccache](https://ccache.dev/).
-    - For debug builds, there are two cases:
-
-      1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+    - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

      ```bash
      cmake -B build -DCMAKE_BUILD_TYPE=Debug
      cmake --build build
      ```

-      2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
+    - Multi-config generators (`-G` param set to Visual Studio, XCode...):

      ```bash
      cmake -B build -G "Xcode"
      cmake --build build --config Debug
      ```

+- Using `Zig` (version 0.11 or later):
+
+    Optimization levels and CPU features (for example AVX2, FMA, F16C) can be selected using standard build arguments.
+    It's also possible to cross-compile for other operating systems and architectures:
+
+    ```bash
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
+    ```
+
+    The `zig targets` command will give you valid options to use.
+
 -   Using `gmake` (FreeBSD):

    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -370,18 +363,15 @@ In order to build llama.cpp you have four different options.
    3. Install compilation dependencies.

        ```bash
-        sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
+        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
+            opencl clblast openblas

        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
        ```

-### Homebrew
-
-On Mac and Linux, the homebrew package manager can be used via
-```
-brew install llama.cpp
-```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+    **Notes:** With these packages you can build llama.cpp with OpenBLAS and
+    CLBlast support to use OpenCL GPU acceleration on FreeBSD. Please read
+    the instructions later in this document on how to use and activate these options.

 ### Metal Build

@@ -391,9 +381,48 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
 When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.

+### MPI Build
+
+MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
+
+First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
+
+Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
+
+- Using `make`:
+
+  ```bash
+  make CC=mpicc CXX=mpicxx LLAMA_MPI=1
+  ```
+
+- Using `CMake`:
+
+  ```bash
+  cmake -S . -B build -DLLAMA_MPI=ON
+  ```
+
+Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
+
+Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
+
+Here is an example hostfile:
+
+```
+192.168.0.1:2
+malvolio.local:1
+```
+
+The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
+
+Finally, you're ready to run a computation using `mpirun`:
+
+```bash
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
+```
+
 ### BLAS Build

-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:

 - #### Accelerate Framework:

@@ -481,12 +510,10 @@ Building the program with BLAS support may lead to some performance improvements
  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                               |
-  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |                                                                                                                                         |
+  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-  | LLAMA_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |

 - #### hipBLAS

@@ -500,28 +527,13 @@ Building the program with BLAS support may lead to some performance improvements
    ```
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
    ```bash
-    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
+        cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
        && cmake --build build --config Release -- -j 16
    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

-    Note that if you get the following error:
-    ```
-    clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
-    ```
-    Try searching for a directory under `HIP_PATH` that contains the file
-    `oclc_abi_version_400.bc`. Then, add the following to the start of the
-    command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
-    like:
-    ```bash
-    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
-    HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build -- -j 16
-    ```
-
  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
@@ -530,8 +542,10 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
    ```bash
    set PATH=%HIP_PATH%\bin;%PATH%
-    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
-    cmake --build build
+    mkdir build
+    cd build
+    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
+    cmake --build .
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
    Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
@@ -547,6 +561,111 @@ Building the program with BLAS support may lead to some performance improvements
  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |

+- #### CLBlast
+
+  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+    - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
+
+    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
+
+    - <details>
+        <summary>Installing the OpenCL SDK from source</summary>
+
+        ```sh
+        git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+        cd OpenCL-SDK
+        cmake -B build -DBUILD_DOCS=OFF \
+          -DBUILD_EXAMPLES=OFF \
+          -DBUILD_TESTING=OFF \
+          -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+          -DOPENCL_SDK_TEST_SAMPLES=OFF
+        cmake --build build
+        cmake --install build --prefix /some/path
+        ```
+      </details>
+
+  ##### Installing CLBlast
+
+  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
+
+  Linux packaging:
+  Fedora Linux:
+  ```bash
+  sudo dnf install clblast
+  ```
+
+  Alternatively, they may be built from source.
+
+  - <details>
+    <summary>Windows:</summary>
+
+      ```cmd
+      set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
+      git clone https://github.com/CNugteren/CLBlast.git
+      cd CLBlast
+      cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+      cmake --build build --config Release
+      cmake --install build --prefix C:/CLBlast
+      ```
+
+      (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
+    </details>
+
+  - <details>
+    <summary>Unix:</summary>
+
+      ```sh
+      git clone https://github.com/CNugteren/CLBlast.git
+      cd CLBlast
+      cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+      cmake --build build --config Release
+      cmake --install build --prefix /some/path
+      ```
+
+      Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+    </details>
+
+  ##### Building Llama with CLBlast
+
+  - Build with make:
+    ```sh
+    make LLAMA_CLBLAST=1
+    ```
+  - CMake (Unix):
+    ```sh
+    cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+    cmake --build build --config Release
+    ```
+  - CMake (Windows):
+    ```cmd
+    set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
+    git clone https://github.com/ggerganov/llama.cpp
+    cd llama.cpp
+    cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+    cmake --build build --config Release
+    cmake --install build --prefix C:/LlamaCPP
+    ```
+
+  ##### Running Llama with CLBlast
+
+  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+  The selection can be a number (starting from 0) or a text string to search:
+
+  ```sh
+  GGML_OPENCL_PLATFORM=1 ./main ...
+  GGML_OPENCL_DEVICE=2 ./main ...
+  GGML_OPENCL_PLATFORM=Intel ./main ...
+  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+  ```
+
+  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+  You can get a list of platforms and devices from the `clinfo -l` command, etc.
+
 - #### Vulkan

  **With docker**:
@@ -576,9 +695,7 @@ Building the program with BLAS support may lead to some performance improvements
  vulkaninfo
  ```

-  Alternatively your package manager might be able to provide the appropriate libraries.
-  For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.
-  For Fedora 40, you can install `vulkan-devel`, `glslc` and `glslang` packages.
+  Alternatively your package manager might be able to provide the appropriate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead.

  Then, build llama.cpp using the cmake command below:

@@ -594,13 +711,9 @@ Building the program with BLAS support may lead to some performance improvements

 ### Prepare and Quantize

-> [!NOTE]
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
-
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
-It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` does not support LLaMA 3. For LLaMA 3 models downloaded from Hugging Face, use `convert-hf-to-gguf.py` instead.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@@ -617,10 +730,10 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert-hf-to-gguf.py models/mymodel/
+python3 convert.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
+python convert.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
@@ -770,6 +883,34 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,

 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.

+### Instruct mode
+
+1. First, download and place the `ggml` model into the `./models` folder
+2. Run the `main` tool like this:
+
+```
+./examples/alpaca.sh
+```
+
+Sample run:
+
+```
+== Running in interactive mode. ==
+ - Press Ctrl+C to interject at any time.
+ - Press Return to return control to LLaMA.
+ - If you want to submit another line, end your input in '\'.
+
+ Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+> How many letters are there in the English alphabet?
+There 26 letters in the English Alphabet
+> What is the most common way of transportation in Amsterdam?
+The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
+> List 5 words that start with "ca".
+cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
+>
+```
+
 ### Obtaining and using the Facebook LLaMA 2 model

 - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
--- a/build.zig
+++ b/build.zig
@@ -0,0 +1,172 @@
+// Compatible with Zig Version 0.11.0
+const std = @import("std");
+const ArrayList = std.ArrayList;
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;
+
+const Maker = struct { // Build helper: holds the builder, target/optimize settings and the shared flag/include lists used to create compile steps.
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    enable_lto: bool, // copied into `want_lto` of every step created by obj() and exe()
+
+    include_dirs: ArrayList([]const u8), // added as include paths to every obj() and exe() step
+    cflags: ArrayList([]const u8), // flags passed for sources whose name ends in ".c"
+    cxxflags: ArrayList([]const u8), // flags passed for all other sources, and for the main source of exe()
+    objs: ArrayList(*Compile), // extra objects linked into every exe(); nothing in this build script appends to it
+
+    fn addInclude(m: *Maker, dir: []const u8) !void { // Appends `dir` to the shared include list.
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void { // Joins `path` components onto the build root and registers the result as an include dir.
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void { // Appends a flag used for C sources only.
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void { // Appends a flag used for C++ sources only.
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void { // Appends a flag to both the C and the C++ flag lists.
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }
+
+    fn init(builder: *std.build.Builder) !Maker { // Resolves target/optimize options, writes common/build-info.cpp, and returns a Maker with default flags and include dirs.
+        const target = builder.standardTargetOptions(.{});
+        const zig_version = @import("builtin").zig_version_string;
+        const commit_hash = try std.ChildProcess.exec( // runs `git rev-parse HEAD`; only its stdout is used below, the exit status is not inspected
+            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
+        );
+        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt( // output path is relative to the current working directory
+            \\int LLAMA_BUILD_NUMBER = {};
+            \\char const *LLAMA_COMMIT = "{s}";
+            \\char const *LLAMA_COMPILER = "Zig {s}";
+            \\char const *LLAMA_BUILD_TARGET = "{s}";
+            \\
+        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) })); // build number is fixed at 0; the slice drops the last byte of git's stdout (presumably the trailing newline) - NOTE(review): `len - 1` assumes stdout is non-empty
+        var m = Maker{
+            .builder = builder,
+            .target = target,
+            .optimize = builder.standardOptimizeOption(.{}),
+            .enable_lto = false, // default; build() overwrites this from the `lto` option
+            .include_dirs = ArrayList([]const u8).init(builder.allocator),
+            .cflags = ArrayList([]const u8).init(builder.allocator),
+            .cxxflags = ArrayList([]const u8).init(builder.allocator),
+            .objs = ArrayList(*Compile).init(builder.allocator),
+        };
+
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{}); // empty path list: the build root itself
+        try m.addProjectInclude(&.{"common"});
+        return m;
+    }
+
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { // Creates an object step named `name` that compiles the single C or C++ source `src`.
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (o.target.getAbi() != .msvc) // _GNU_SOURCE is defined for every ABI except MSVC
+            o.defineCMacro("_GNU_SOURCE", null);
+
+        if (std.mem.endsWith(u8, src, ".c")) { // language is chosen by file suffix: ".c" is C, anything else is treated as C++
+            o.addCSourceFiles(&.{src}, m.cflags.items);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
+            if (o.target.getAbi() == .msvc) {
+                o.linkLibC(); // need winsdk + crt
+            } else {
+                // linkLibCpp already adds (libc++ + libunwind + libc)
+                o.linkLibCpp();
+            }
+        }
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
+        return o;
+    }
+
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile { // Creates and installs an executable built from `src` (compiled with the C++ flags) plus the objects in `deps` and `m.objs`.
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
+        for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
+
+        // https://github.com/ziglang/zig/issues/15448
+        if (e.target.getAbi() == .msvc) {
+            e.linkLibC(); // need winsdk + crt
+        } else {
+            // linkLibCpp already adds (libc++ + libunwind + libc)
+            e.linkLibCpp();
+        }
+        m.builder.installArtifact(e);
+        e.want_lto = m.enable_lto;
+        return e;
+    }
+};
+
+pub fn build(b: *std.build.Builder) !void { // Build entry point: declares the object steps, the example executables, and generates the server asset headers.
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; // set before any obj()/exe() call so every step sees the chosen value
+
+    const ggml = make.obj("ggml", "ggml.c");
+    const sgemm = make.obj("sgemm", "sgemm.cpp");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
+    const unicode = make.obj("unicode", "unicode.cpp");
+    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
+    const llama = make.obj("llama", "llama.cpp");
+    const buildinfo = make.obj("common", "common/build-info.cpp"); // compiles the file written by Maker.init - NOTE(review): the step name "common" is reused for common.cpp on the next line; confirm this is intended
+    const common = make.obj("common", "common/common.cpp");
+    const console = make.obj("console", "common/console.cpp");
+    const sampling = make.obj("sampling", "common/sampling.cpp");
+    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+    const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
+    const train = make.obj("train", "common/train.cpp");
+    const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
+
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
+    if (server.target.isWindows()) { // the server is additionally linked against ws2_32 on Windows targets
+        server.linkSystemLibrary("ws2_32");
+    }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| { // runs while build() itself executes: each asset is turned into a C array header next to the server sources
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize)); // whole file is read into memory; path is relative to the current working directory
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| { // one "0xNN, " entry (two uppercase hex digits) per input byte
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_"); // symbol name: asset file name with '-' replaced by '_' ...
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_'); // ... and '.' replaced by '_' in place
+
+        try std.fs.cwd().writeFile(output_path, b.fmt( // emits `unsigned char <name>[]` plus `unsigned int <name>_len`
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
+}
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -202,15 +202,12 @@ function gg_sum_test_scripts_release {
 }

 function gg_get_model {
-    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
-    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
-    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_0 ]]; then
-        echo -n "$gguf_0"
-    elif [[ -s $gguf_1 ]]; then
-        echo -n "$gguf_1"
-    elif [[ -s $gguf_2 ]]; then
-        echo -n "$gguf_2"
+    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
+    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_3b ]]; then
+        echo -n "$gguf_3b"
+    elif [[ -s $gguf_7b ]]; then
+        echo -n "$gguf_7b"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
@@ -259,6 +256,186 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
 }

+# open_llama_3b_v2
+# builds with -DLLAMA_QKK_64=1, converts + quantizes the model, then runs main / perplexity / imatrix / save-load-state and the LoRA perplexity checks
+function gg_run_open_llama_3b_v2 {
+    cd ${SRC}
+    # model files first, then the wikitext-2 archive (only the first 60 lines of wiki.test.raw are used below)
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+    # the "../" paths above are relative to the build directory entered on the next line
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    # perplexity output is appended to the same per-quant tg logs; the check_ppl calls below read the "[1]" line back from them
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    # check_ppl <label> <"[1]" log line>: the last decimal number in the line is the perplexity; returns 20 if it is missing or > 20.0
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        # an empty value would make the bc/[ test error out and fall through to OK, so reject it explicitly
+        if [ -z "$ppl" ] || [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "${ppl:-n/a}"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    # lora: compare_ppl <label> <base "[1]" line> <lora "[1]" line> returns 20 unless both values parse and lora ppl <= base ppl
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        # an empty value would make the bc/[ test error out and fall through to OK, so reject it explicitly
+        if [ -z "$ppl1" ] || [ -z "$ppl2" ] || [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s %s (FAIL: %s > %s)\n' "$qnt" "${ppl1:-n/a}" "${ppl2:-n/a}" "${ppl2:-n/a}" "${ppl1:-n/a}"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/3B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+    # markdown summary: embeds the logs that gg_run_open_llama_3b_v2 writes under $OUT (the .exit file is not written there; presumably the CI driver creates it)
+    gg_printf 'OpenLLaMA 3B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
+}
+
 # open_llama_7b_v2
 # requires: GG_BUILD_CUDA

@@ -287,7 +464,7 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -372,6 +549,48 @@ function gg_run_open_llama_7b_v2 {

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

+    # lora: compare_ppl <label> <base "[1]" line> <lora "[1]" line> returns 20 unless both values parse and lora ppl <= base ppl
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        # an empty value would make the bc/[ test error out and fall through to OK, so reject it explicitly
+        if [ -z "$ppl1" ] || [ -z "$ppl2" ] || [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s %s (FAIL: %s > %s)\n' "$qnt" "${ppl1:-n/a}" "${ppl2:-n/a}" "${ppl2:-n/a}" "${ppl1:-n/a}"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/7B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # currently not supported by the CUDA backend
+    # q8_0
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
    set +e
 }

@@ -382,6 +601,7 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -394,272 +614,11 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_1.4b
-
-function gg_run_pythia_1_4b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/pythia/1.4B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_1_4b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 1.4B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_2_8b
-# requires: GG_BUILD_CUDA
-
-function gg_run_pythia_2_8b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-
-    path_models="../models-mnt/pythia/2.8B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test="${path_wiki}/wiki.test.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_2_8b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 2.8B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # bge-small
@@ -688,7 +647,7 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert-hf-to-gguf.py ${path_models}

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -742,10 +701,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run pythia_1_4b
+            test $ret -eq 0 && gg_run open_llama_3b_v2
        else
-            test $ret -eq 0 && gg_run pythia_2_8b
-            #test $ret -eq 0 && gg_run open_llama_7b_v2
+            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -1,16 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-
-set( CMAKE_C_COMPILER    clang )
-set( CMAKE_CXX_COMPILER  clang++ )
-
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
-
-set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
--- a/cmake/arm64-windows-msvc.cmake
+++ b/cmake/arm64-windows-msvc.cmake
@@ -1,6 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
--- a/cmake/llama.pc.in
+++ b/cmake/llama.pc.in
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/lib
-includedir=${prefix}/include
-
-Name: llama
-Description: Port of Facebook's LLaMA model in C/C++
-Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lllama
-Cflags: -I${includedir}
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -84,4 +84,4 @@ endif ()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -27,7 +27,7 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

 #define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

@@ -35,18 +35,14 @@

 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

-//
-// CPU utils
-//
-
-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
+int get_math_cpu_count(); // supplies the default value of gpt_params::n_threads; NOTE(review): returns int while its sibling below returns int32_t
+int32_t get_num_physical_cores(); // presumably the number of physical CPU cores - confirm against the definition in common.cpp

 //
 // CLI argument parsing
@@ -55,35 +51,36 @@ int32_t cpu_get_num_math();
 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       =    -1;
-    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft =    -1;
-    int32_t n_predict             =    -1; // new tokens to predict
-    int32_t n_ctx                 =     0; // context size
-    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
-    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
-    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel            =     1; // number of parallel sequences to decode
-    int32_t n_sequences           =     1; // number of sequences to decode
-    float   p_split               =  0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams               =     0; // if non-zero then use beam search of given width.
-    int32_t grp_attn_n            =     1; // group-attention factor
-    int32_t grp_attn_w            =   512; // group-attention width
-    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
-    float   rope_freq_base        =  0.0f; // RoPE base frequency
-    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
+    int32_t n_threads             = get_math_cpu_count();
+    int32_t n_threads_draft       = -1;
+    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft = -1;
+    int32_t n_predict             = -1;    // new tokens to predict
+    int32_t n_ctx                 = 512;   // context size
+    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
+    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
+    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel            = 1;     // number of parallel sequences to decode
+    int32_t n_sequences           = 1;     // number of sequences to decode
+    float   p_split               = 0.1f;  // speculative decoding split probability
+    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
+    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
+    int32_t grp_attn_n            = 1;     // group-attention factor
+    int32_t grp_attn_w            = 512;   // group-attention width
+    int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base        = 0.0f;  // RoPE base frequency
+    float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
+    float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx         =     0; // YaRN original context length
+    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
+    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold

    ggml_backend_sched_eval_callback cb_eval = nullptr;
@@ -91,32 +88,29 @@ struct gpt_params {

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;

-    std::string model                = ""; // model path
-    std::string model_draft          = ""; // draft model for speculative decoding
+    std::string model                = "";  // model path
+    std::string model_draft          = "";  // draft model for speculative decoding
    std::string model_alias          = "unknown"; // model alias
-    std::string model_url            = ""; // model url to download
-    std::string hf_repo              = ""; // HF repo
-    std::string hf_file              = ""; // HF file
+    std::string model_url            = "";  // model url to download
+    std::string hf_repo              = "";  // HF repo
+    std::string hf_file              = "";  // HF file
    std::string prompt               = "";
-    std::string prompt_file          = ""; // store the external prompt file name
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix         = ""; // string to prefix user inputs with
-    std::string input_suffix         = ""; // string to suffix user inputs with
-    std::string logdir               = ""; // directory in which to save YAML log files
+    std::string prompt_file          = "";  // store the external prompt file name
+    std::string path_prompt_cache    = "";  // path to file for saving/loading prompt eval state
+    std::string input_prefix         = "";  // string to prefix user inputs with
+    std::string input_suffix         = "";  // string to suffix user inputs with
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    std::string logdir               = "";  // directory in which to save YAML log files
    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file          = ""; // file for saving *all* logits
-    std::string rpc_servers          = ""; // comma separated list of RPC servers
+    std::string logits_file          = "";  // file for saving *all* logits

-    std::vector<std::string> in_files;   // all input files
-    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;

    // TODO: avoid tuple, use struct
@@ -125,36 +119,34 @@ struct gpt_params {

    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

-    int32_t verbosity                  = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector

-    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-                                     //                                       (which is more convenient to use for plotting)
-                                     //
-    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
+    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                    //                                       (which is more convenient to use for plotting)
+                                    //
+    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

-    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

-    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

-    bool   kl_divergence    = false; // compute KL divergence
+    bool   kl_divergence   = false; // compute KL divergence

-    bool usage             = false; // print usage
+    bool random_prompt     = false; // presumably: use a random prompt when none is provided (see gpt_random_prompt); default false = do not randomize
    bool use_color         = false; // use color to distinguish generations and inputs
-    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
-    bool interactive_first = false; // wait for user input immediately
-    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
+    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

    bool embedding         = false; // get only sentence embedding
-    bool escape            = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
+    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
@@ -162,6 +154,7 @@ struct gpt_params {

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
+    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
@@ -179,105 +172,37 @@ struct gpt_params {
    // multimodal models (see examples/llava)
    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)
-
-    // server params
-    int32_t port           = 8080;         // server listens on this network port
-    int32_t timeout_read   = 600;          // http read timeout in seconds
-    int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
-
-    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
-
-    std::vector<std::string> api_keys;
-
-    std::string ssl_file_key  = "";
-    std::string ssl_file_cert = "";
-
-    bool endpoint_slots   = true;
-    bool endpoint_metrics = false;
-
-    bool log_json = false;
-
-    std::string slot_save_path;
-
-    float slot_prompt_similarity = 0.5f;
-
-    // batched-bench params
-    bool is_pp_shared = false;
-
-    std::vector<int32_t> n_pp;
-    std::vector<int32_t> n_tg;
-    std::vector<int32_t> n_pl;
-
-    // retrieval params
-    std::vector<std::string> context_files; // context files to embed
-
-    int32_t chunk_size = 64; // chunk size for context embedding
-
-    std::string chunk_separator = "\n"; // chunk separator for context embedding
-
-    // passkey params
-    int32_t n_junk = 250; // number of times to repeat the junk text
-    int32_t i_pos  = -1;  // position of the passkey in the junk text
-
-    // imatrix params
-    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
-    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
-    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
-    int32_t i_chunk     =  0; // start processing from this chunk
-
-    bool process_output = false; // collect data for the output tensor
-    bool compute_ppl    = true;  // whether to compute perplexity
 };

 void gpt_params_handle_model_default(gpt_params & params);

-bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);

-std::string gpt_params_get_system_info(const gpt_params & params);
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
+std::string get_system_info(const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+void process_escapes(std::string& input);
+
+bool validate_file_name(const std::string & filename);

 //
 // String utils
 //

+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
-
 std::string string_strip(const std::string & str);
-std::string string_get_sortable_timestamp();
-
-template<class T>
-static std::vector<T> string_split(const std::string & str, char delim) {
-    std::vector<T> values;
-    std::istringstream str_stream(str);
-    std::string token;
-    while (std::getline(str_stream, token, delim)) {
-        T value;
-        std::istringstream token_stream(token);
-        token_stream >> value;
-        values.push_back(value);
-    }
-    return values;
-}
-
-bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-void string_process_escapes(std::string & input);
-
-//
-// Filesystem utils
-//
-
-bool fs_validate_filename(const std::string & filename);
-bool fs_create_directory_with_parents(const std::string & path);
-
-std::string fs_get_cache_directory();
-std::string fs_get_cache_file(const std::string & filename);
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

 //
 // Model utils
@@ -349,21 +274,28 @@ std::string llama_detokenize_bpe(
 bool llama_should_add_bos_token(const llama_model * model);

 //
-// Chat template utils
+// YAML utils
 //

-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool create_directory_with_parents(const std::string & path);
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+std::string get_sortable_timestamp();
+
+void dump_non_result_info_yaml(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

 //
 // KV cache utils
 //

 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
@@ -397,20 +329,6 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
 //
 // Split utils
 //
-
 static const char * const LLM_KV_SPLIT_NO            = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float    (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int      (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -26,7 +26,7 @@ namespace grammar_parser {

    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
+        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); // assuming a std::map-style container: insert keeps an existing entry, so a known name returns its original id and next_id is only used for new names
        return result.first->second;
    }

@@ -46,12 +46,8 @@ namespace grammar_parser {
        state.rules[rule_id] = rule;
    }

-    static bool is_digit_char(char c) {
-        return '0' <= c && c <= '9';
-    }
-
    static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); // true for ASCII letters, '-', and ASCII digits (digit test inlined here)
    }

    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@@ -103,17 +99,6 @@ namespace grammar_parser {
        return pos;
    }

-    static const char * parse_int(const char * src) {
-        const char * pos = src;
-        while (is_digit_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting integer at ") + src);
-        }
-        return pos;
-    }
-
    static std::pair<uint32_t, const char *> parse_char(const char * src) {
        if (*src == '\\') {
            switch (src[1]) {
@@ -152,68 +137,11 @@ namespace grammar_parser {
            bool                                 is_nested) {
        size_t last_sym_start = out_elements.size();
        const char * pos = src;
-
-        auto handle_repetitions = [&](int min_times, int max_times) {
-
-            if (last_sym_start == out_elements.size()) {
-                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
-            }
-
-            // apply transformation to previous symbol (last_sym_start to end) according to
-            // the following rewrite rules:
-            // S{m,n} --> S S S (m times) S'(n-m)
-            //            S'(x)   ::= S S'(x-1) |
-            //            (... n-m definitions of these S' rules ...)
-            //            S'(1)   ::= S |
-            // S{m,} -->  S S S (m times) S'
-            //            S'     ::= S S' |
-            // S*     --> S{0,}
-            //        --> S'     ::= S S' |
-            // S+     --> S{1,}
-            //        --> S S'
-            //            S'     ::= S S' |
-            // S?     --> S{0,1}
-            //        --> S'
-            //            S'     ::= S |
-
-            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
-            if (min_times == 0) {
-                out_elements.resize(last_sym_start);
-            } else {
-                // Repeat the previous elements (min_times - 1) times
-                for (int i = 1; i < min_times; i++) {
-                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
-                }
-            }
-
-            uint32_t last_rec_rule_id = 0;
-            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
-
-            std::vector<llama_grammar_element> rec_rule(previous_elements);
-            for (int i = 0; i < n_opt; i++) {
-                rec_rule.resize(previous_elements.size());
-                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
-                if (i > 0 || max_times < 0) {
-                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
-                }
-                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, rec_rule_id, rec_rule);
-                last_rec_rule_id = rec_rule_id;
-            }
-            if (n_opt > 0) {
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
-            }
-        };
-
        while (*pos) {
            if (*pos == '"') { // literal string
                pos++;
                last_sym_start = out_elements.size();
                while (*pos != '"') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -228,9 +156,6 @@ namespace grammar_parser {
                }
                last_sym_start = out_elements.size();
                while (*pos != ']') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    enum llama_gretype type = last_sym_start < out_elements.size()
@@ -239,9 +164,6 @@ namespace grammar_parser {

                    out_elements.push_back({type, char_pair.first});
                    if (pos[0] == '-' && pos[1] != ']') {
-                        if (!pos[1]) {
-                            throw std::runtime_error("unexpected end of input");
-                        }
                        auto endchar_pair = parse_char(pos + 1);
                             pos          = endchar_pair.second;
                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
@@ -266,51 +188,40 @@ namespace grammar_parser {
                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
                }
                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '.') { // any char
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, -1);
-            } else if (*pos == '+') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(1, -1);
-            } else if (*pos == '?') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, 1);
-            } else if (*pos == '{') {
-                pos = parse_space(pos + 1, is_nested);
-
-                if (!is_digit_char(*pos)) {
-                    throw std::runtime_error(std::string("expecting an int at ") + pos);
+            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
+                if (last_sym_start == out_elements.size()) {
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
                }
-                const char * int_end = parse_int(pos);
-                int min_times = std::stoul(std::string(pos, int_end - pos));
-                pos = parse_space(int_end, is_nested);

-                int max_times = -1;
-
-                if (*pos == '}') {
-                    max_times = min_times;
-                    pos = parse_space(pos + 1, is_nested);
-                } else if (*pos == ',') {
-                    pos = parse_space(pos + 1, is_nested);
-
-                    if (is_digit_char(*pos)) {
-                        const char * int_end = parse_int(pos);
-                        max_times = std::stoul(std::string(pos, int_end - pos));
-                        pos = parse_space(int_end, is_nested);
-                    }
-
-                    if (*pos != '}') {
-                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
-                    }
-                    pos = parse_space(pos + 1, is_nested);
-                } else {
-                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
+                // apply transformation to previous symbol (last_sym_start to end) according to
+                // rewrite rules:
+                // S* --> S' ::= S S' |
+                // S+ --> S' ::= S S' | S
+                // S? --> S' ::= S |
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                std::vector<llama_grammar_element> sub_rule;
+                // add preceding symbol to generated rule
+                sub_rule.insert(
+                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+                if (*pos == '*' || *pos == '+') {
+                    // cause generated rule to recurse
+                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
                }
-                handle_repetitions(min_times, max_times);
+                // mark start of alternate def
+                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                if (*pos == '+') {
+                    // add preceding symbol as alternate only for '+' (otherwise empty)
+                    sub_rule.insert(
+                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+                }
+                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, sub_rule_id, sub_rule);
+
+                // in original rule, replace previous symbol with reference to generated rule
+                out_elements.resize(last_sym_start);
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+
+                pos = parse_space(pos + 1, is_nested);
            } else {
                break;
            }
@@ -405,7 +316,6 @@ namespace grammar_parser {
            case LLAMA_GRETYPE_CHAR_NOT:       return true;
            case LLAMA_GRETYPE_CHAR_ALT:       return true;
            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
-            case LLAMA_GRETYPE_CHAR_ANY:       return true;
            default:                           return false;
        }
    }
@@ -420,7 +330,6 @@ namespace grammar_parser {
                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-                case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
            }
            switch (elem.type) {
                case LLAMA_GRETYPE_END:
@@ -432,7 +341,6 @@ namespace grammar_parser {
                case LLAMA_GRETYPE_CHAR_NOT:
                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
                case LLAMA_GRETYPE_CHAR_ALT:
-                case LLAMA_GRETYPE_CHAR_ANY:
                    fprintf(file, "(\"");
                    print_grammar_char(file, elem.value);
                    fprintf(file, "\") ");
@@ -490,15 +398,11 @@ namespace grammar_parser {
                    }
                    print_grammar_char(file, elem.value);
                    break;
-                case LLAMA_GRETYPE_CHAR_ANY:
-                    fprintf(file, ".");
-                    break;
            }
            if (is_char_element(elem)) {
                switch (rule[i + 1].type) {
                    case LLAMA_GRETYPE_CHAR_ALT:
                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    case LLAMA_GRETYPE_CHAR_ANY:
                        break;
                    default:
                        fprintf(file, "] ");
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -16,55 +16,92 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa

 static std::string repeat(const std::string & str, size_t n);

-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
-    auto has_max = max_items != std::numeric_limits<int>::max();
-
-    if (min_items == 0 && max_items == 1) {
-        return item_rule + "?";
-    }
-
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
    if (separator_rule.empty()) {
-        if (min_items == 1 && !has_max) {
+        if (min_items == 0 && max_items == 1) {
+            return item_rule + "?";
+        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
            return item_rule + "+";
-        } else if (min_items == 0 && !has_max) {
-            return item_rule + "*";
-        } else {
-            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
        }
    }

-    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
-    if (min_items == 0) {
-        result = "(" + result + ")?";
+    std::string result;
+    if (min_items > 0) {
+        if (item_rule_is_literal && separator_rule.empty()) {
+            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+        } else {
+            std::vector<std::string> items(min_items, item_rule);
+            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+        }
    }
+
+    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+        if (up_to_n == 0) {
+            return "";
+        } else if (up_to_n == 1) {
+            return "(" + content + ")?";
+        } else if (!separator_rule.empty() && !prefix_with_sep) {
+            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+        } else {
+            std::string res = repeat("(" + content + " ", up_to_n);
+            // strip trailing space
+            res = res.substr(0, res.length() - 1);
+            res += repeat(")?", up_to_n);
+            return res;
+        }
+    };
+
+    if (min_items > 0 && max_items != min_items) {
+        result += " ";
+    }
+
+    if (max_items != std::numeric_limits<int>::max()) {
+        result += opt_repetitions(max_items - min_items, min_items > 0);
+    } else {
+        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+        if (min_items == 0 && !separator_rule.empty()) {
+            result = "(" + item_rule + " " + item_operator + "*)?";
+        } else {
+            result += item_operator + "*";
+        }
+    }
+
    return result;
 }

-const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+const std::string SPACE_RULE = "\" \"?";

 struct BuiltinRule {
    std::string content;
    std::vector<std::string> deps;
 };

+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
 std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9]{1,16}", {}}},
-    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
+    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
-    {"char",   {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+    {"char",   {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
    {"null", {"\"null\" space", {}}},
 };

 std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@@ -235,7 +272,7 @@ private:
                    if (literal.empty()) {
                        return false;
                    }
-                    ret.emplace_back(literal, true);
+                    ret.push_back(std::make_pair(literal, true));
                    literal.clear();
                    return true;
                };
@@ -261,7 +298,7 @@ private:
            while (i < length) {
                char c = sub_pattern[i];
                if (c == '.') {
-                    seq.emplace_back(get_dot(), false);
+                    seq.push_back(std::make_pair(get_dot(), false));
                    i++;
                } else if (c == '(') {
                    i++;
@@ -270,7 +307,7 @@ private:
                            _warnings.push_back("Unsupported pattern syntax");
                        }
                    }
-                    seq.emplace_back("(" + to_rule(transform()) + ")", false);
+                    seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
                } else if (c == ')') {
                    i++;
                    if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -294,9 +331,9 @@ private:
                    }
                    square_brackets += ']';
                    i++;
-                    seq.emplace_back(square_brackets, false);
+                    seq.push_back(std::make_pair(square_brackets, false));
                } else if (c == '|') {
-                    seq.emplace_back("|", false);
+                    seq.push_back(std::make_pair("|", false));
                    i++;
                } else if (c == '*' || c == '+' || c == '?') {
                    seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -348,7 +385,8 @@ private:
                        sub_is_literal ? "\"" + sub + "\"" : sub,
                        min_times,
                        max_times,
-                        ""
+                        "",
+                        sub_is_literal
                    );
                    seq.back().second = false;
                } else {
@@ -379,7 +417,7 @@ private:
                        }
                    }
                    if (!literal.empty()) {
-                        seq.emplace_back(literal, true);
+                        seq.push_back(std::make_pair(literal, true));
                    }
                }
            }
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -1,8 +1,4 @@
 #pragma once
-
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
--- a/common/log.h
+++ b/common/log.h
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #else
        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #endif
 #else
    #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #else
        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #endif
 #else
    #define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Main LOG macro.
 //  behaves like printf, and supports arguments the exact same way.
 //
-#if !defined(_MSC_VER) || defined(__clang__)
+#ifndef _MSC_VER
    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Secondary target can be changed just like LOG_TARGET
 //  by defining LOG_TEE_TARGET
 //
-#if !defined(_MSC_VER) || defined(__clang__)
+#ifndef _MSC_VER
    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif

 // LOG macro variants with auto endline.
-#if !defined(_MSC_VER) || defined(__clang__)
+#ifndef _MSC_VER
    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

    result->prev.resize(params.n_prev);

-    result->n_valid = 0;
+    result->n_considered = 0;

    llama_sampling_set_rng_seed(result, params.seed);

@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
    ctx->cur.clear();
-    ctx->n_valid = 0;
+    ctx->n_considered = 0;
 }

 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
    std::string result = "CFG -> Penalties ";
    if (params.mirostat == 0) {
        for (auto sampler_type : params.samplers_sequence) {
-            const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
            if (!sampler_type_name.empty()) {
                result += "-> " + sampler_type_name + " ";
            }
@@ -137,87 +137,6 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
    return result;
 }

-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
-    switch (sampler_type) {
-        case llama_sampler_type::TOP_K:       return "top_k";
-        case llama_sampler_type::TFS_Z:       return "tfs_z";
-        case llama_sampler_type::TYPICAL_P:   return "typical_p";
-        case llama_sampler_type::TOP_P:       return "top_p";
-        case llama_sampler_type::MIN_P:       return "min_p";
-        case llama_sampler_type::TEMPERATURE: return "temperature";
-        default : return "";
-    }
-}
-
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
-        {"top_k",       llama_sampler_type::TOP_K},
-        {"top_p",       llama_sampler_type::TOP_P},
-        {"typical_p",   llama_sampler_type::TYPICAL_P},
-        {"min_p",       llama_sampler_type::MIN_P},
-        {"tfs_z",       llama_sampler_type::TFS_Z},
-        {"temperature", llama_sampler_type::TEMPERATURE}
-    };
-
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
-        {"top-k",       llama_sampler_type::TOP_K},
-        {"top-p",       llama_sampler_type::TOP_P},
-        {"nucleus",     llama_sampler_type::TOP_P},
-        {"typical-p",   llama_sampler_type::TYPICAL_P},
-        {"typical",     llama_sampler_type::TYPICAL_P},
-        {"min-p",       llama_sampler_type::MIN_P},
-        {"tfs-z",       llama_sampler_type::TFS_Z},
-        {"tfs",         llama_sampler_type::TFS_Z},
-        {"temp",        llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names.size());
-    for (const auto & name : names)
-    {
-        auto sampler_item = sampler_canonical_name_map.find(name);
-        if (sampler_item != sampler_canonical_name_map.end())
-        {
-            sampler_types.push_back(sampler_item->second);
-        }
-        else
-        {
-            if (allow_alt_names)
-            {
-                sampler_item = sampler_alt_name_map.find(name);
-                if (sampler_item != sampler_alt_name_map.end())
-                {
-                    sampler_types.push_back(sampler_item->second);
-                }
-            }
-        }
-    }
-    return sampler_types;
-}
-
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
-    std::unordered_map<char, llama_sampler_type> sampler_name_map {
-        {'k', llama_sampler_type::TOP_K},
-        {'p', llama_sampler_type::TOP_P},
-        {'y', llama_sampler_type::TYPICAL_P},
-        {'m', llama_sampler_type::MIN_P},
-        {'f', llama_sampler_type::TFS_Z},
-        {'t', llama_sampler_type::TEMPERATURE}
-    };
-
-    std::vector<llama_sampler_type> sampler_types;
-    sampler_types.reserve(names_string.size());
-    for (const auto & c : names_string) {
-        const auto sampler_item = sampler_name_map.find(c);
-        if (sampler_item != sampler_name_map.end()) {
-            sampler_types.push_back(sampler_item->second);
-        }
-    }
-    return sampler_types;
-}
-
 // no reasons to expose this function in header
 static void sampler_queue(
                   struct llama_context * ctx_main,
@@ -260,7 +179,7 @@ static llama_token llama_sampling_sample_impl(
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
                  const int idx,
-                  bool is_resampling) {
+                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
    const llama_sampling_params & params = ctx_sampling->params;

    const float   temp            = params.temp;
@@ -269,8 +188,8 @@ static llama_token llama_sampling_sample_impl(
    const float   mirostat_eta    = params.mirostat_eta;

    std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
+    if (!is_resampling) {
        GGML_ASSERT(!original_logits.empty());
    }
    llama_token id = 0;
@@ -333,11 +252,11 @@ static llama_token llama_sampling_sample_impl(
            // Restore logits from the copy
            std::copy(original_logits.begin(), original_logits.end(), logits);

-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
+            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
        }
    }

-    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
+    ctx_sampling->n_considered = cur_p.size;

    return id;
 }
@@ -366,8 +285,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    if (ctx_sampling->grammar != NULL && !apply_grammar) {
-        GGML_ASSERT(original_logits != NULL);
+    if (apply_grammar && original_logits != NULL) {
        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
    }
@@ -424,7 +342,7 @@ llama_token llama_sampling_sample(
                  struct llama_context * ctx_cfg,
                  const int idx) {
    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
+    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
 }

 llama_token_data_array llama_sampling_prepare(
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -81,7 +81,7 @@ struct llama_sampling_context {
    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
-    size_t n_valid; // Number of correct top tokens with correct probabilities.
+    size_t n_considered;

    std::mt19937 rng;
 };
@@ -116,11 +116,6 @@ std::string llama_sampling_print(const llama_sampling_params & params);
 // Print sampling order into a string
 std::string llama_sampling_order_print(const llama_sampling_params & params);

-std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
-
-std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
-
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

    params.custom_n_ctx = false;

-    params.use_flash              = false;
+    params.use_flash              = true;
    params.use_checkpointing      = true;

    params.sample_start           = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(

 void finish_processing_train_args(struct train_params_common * params) {
    if (params->escape) {
-        string_process_escapes(params->sample_start);
+        process_escapes(params->sample_start);
    }
 }

--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-

 # This script downloads the tokenizer models of the specified models from Huggingface and
 # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@@ -21,13 +20,11 @@
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
+# TODO: automate the update of convert-hf-to-gguf.py
 #

 import logging
 import os
-import pathlib
-import re
-
 import requests
 import sys
 import json
@@ -38,7 +35,6 @@ from transformers import AutoTokenizer

 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
-sess = requests.Session()


 class TOKENIZER_TYPE(IntEnum):
@@ -53,10 +49,6 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍

 if len(sys.argv) == 2:
    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
-        sys.exit(1)
 else:
    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)
@@ -73,57 +65,68 @@ models = [
    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
-    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
-    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]

+# make directory "models/tokenizers" if it doesn't exist
+if not os.path.exists("models/tokenizers"):
+    os.makedirs("models/tokenizers")
+

 def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        with open(save_path, 'wb') as f:
+            f.write(response.content)
+        logger.info(f"File {save_path} downloaded successfully")
+    else:
+        logger.info(f"Failed to download file. Status code: {response.status_code}")


-def download_model(model):
+# download the tokenizer models
+for model in models:
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

-    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        os.makedirs(f"models/tokenizers/{name}")
+    else:
+        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
+        continue
+
+    logger.info(f"Downloading {name} to models/tokenizers/{name}")
+
+    url = f"{repo}/raw/main/config.json"
+    save_path = f"models/tokenizers/{name}/config.json"
+    download_file_with_auth(url, token, save_path)
+
+    url = f"{repo}/raw/main/tokenizer.json"
+    save_path = f"models/tokenizers/{name}/tokenizer.json"
+    download_file_with_auth(url, token, save_path)
+
+    # if downloaded file is less than 1KB, we likely need to download an LFS instead
+    if os.path.getsize(save_path) < 1024:
+        # remove the file
+        os.remove(save_path)
+        url = f"{repo}/resolve/main/tokenizer.json"
+        save_path = f"models/tokenizers/{name}/tokenizer.json"
+        download_file_with_auth(url, token, save_path)

-    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
    if tokt == TOKENIZER_TYPE.SPM:
-        files.append("tokenizer.model")
-
-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
-
-
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+        url = f"{repo}/resolve/main/tokenizer.model"
+        save_path = f"models/tokenizers/{name}/tokenizer.model"
+        download_file_with_auth(url, token, save_path)

+    url = f"{repo}/raw/main/tokenizer_config.json"
+    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
+    download_file_with_auth(url, token, save_path)

 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# TODO: auto-update convert-hf-to-gguf.py with the generated function

 src_ifs = ""
 for model in models:
@@ -133,17 +136,8 @@ for model in models:
    if tokt == TOKENIZER_TYPE.SPM:
        continue

-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
    # create the tokenizer
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -157,12 +151,8 @@ for model in models:
    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

    logger.info("")

@@ -212,18 +202,11 @@ src_func = f"""
        return res
 """

-convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
-convert_py = re.sub(
-    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
-    lambda m: m.group(1) + src_func + m.group(3),
-    convert_py,
-    flags=re.DOTALL | re.MULTILINE,
-)
+print(src_func) # noqa: NP100

-convert_py_pth.write_text(convert_py)
-
-logger.info("+++ convert-hf-to-gguf.py was updated")
+logger.info("\n")
+logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+logger.info("\n")

 # generate tests for each tokenizer model

@@ -270,7 +253,6 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
-    # "Cửa Việt", # llama-bpe fails on this
    chktxt,
 ]

@@ -291,17 +273,8 @@ for model in models:
    name = model["name"]
    tokt = model["tokt"]

-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
    # create the tokenizer
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
-        continue  # Skip this model and continue with the next one in the loop
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any, BinaryIO, Sequence
+
+import numpy as np
+import torch
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger("lora-to-gguf")
+
+NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
+
+
+def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
+    """Emit the file preamble: magic, format version, LoRA rank `r` and `lora_alpha`.
+
+    Every field after the magic is packed with struct format "i".
+    Fails an assertion when `lora_alpha` cannot be represented as an int without loss.
+    """
+    pack_int = struct.Struct("i").pack
+    magic = bytes(reversed(b"ggla"))  # magic (ggml lora), stored byte-reversed
+    fout.write(magic)
+    fout.write(pack_int(1))  # file version
+    fout.write(pack_int(params["r"]))
+    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
+    # but some models ship a float value instead
+    # let's convert to int, but fail if lossless conversion is not possible
+    alpha = params["lora_alpha"]
+    alpha_as_int = int(alpha)
+    assert alpha_as_int == alpha, "cannot convert float to int losslessly"
+    fout.write(pack_int(alpha_as_int))
+
+
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
+    """Write one tensor header: dimension count, name length, ftype code, reversed dims, name.
+
+    Afterwards the stream position is rounded up to a multiple of 32.
+    """
+    encoded_name = name.encode("utf-8")
+    n_dims = len(shape)
+    ftype = NUMPY_TYPE_TO_FTYPE[data_type.name]  # KeyError for dtypes missing from the table
+    fout.write(struct.pack("iii", n_dims, len(encoded_name), ftype))
+    # the dimensions are stored in reverse order
+    reversed_dims = shape[::-1]
+    fout.write(struct.pack("i" * n_dims, *reversed_dims))
+    fout.write(encoded_name)
+    # move to the next offset that is a multiple of 32 (no-op when already aligned)
+    aligned_pos = (fout.tell() + 31) & -32
+    fout.seek(aligned_pos)
+
+
+if __name__ == '__main__':
+    # Entry point: convert a HuggingFace PEFT LoRA adapter directory (argv[1]) into the binary
+    # "ggla" format, written into the same directory as ggml-adapter-model.bin.
+    # argv[2] optionally selects the architecture used for tensor-name mapping (default: llama).
+    if len(sys.argv) < 2:
+        logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
+        logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
+        logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
+        sys.exit(1)
+
+    input_json = os.path.join(sys.argv[1], "adapter_config.json")
+    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+
+    if os.path.exists(input_model):
+        # NOTE(security): torch.load unpickles the file - only convert adapters from a trusted source
+        model = torch.load(input_model, map_location="cpu")
+    else:
+        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+        model = load_file(input_model, device="cpu")
+
+    # accept the arch whenever it is present; an `== 3` test would silently fall back to
+    # "llama" as soon as any extra argument followed it
+    arch_name = sys.argv[2] if len(sys.argv) >= 3 else "llama"
+
+    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+        logger.error(f"Error: unsupported architecture {arch_name}")
+        sys.exit(1)
+
+    # reverse lookup: architecture name -> architecture key
+    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
+
+    # JSON is UTF-8 by specification; do not depend on the locale's default encoding
+    with open(input_json, "r", encoding="utf-8") as f:
+        params = json.load(f)
+
+    if params["peft_type"] != "LORA":
+        logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+        sys.exit(1)
+
+    # the optional keys below are read with .get() so that a config omitting them is
+    # treated as "not set" instead of aborting with a KeyError
+    if params.get("fan_in_fan_out") is True:
+        logger.error("Error: param fan_in_fan_out is not supported")
+        sys.exit(1)
+
+    bias = params.get("bias")
+    if bias is not None and bias != "none":
+        logger.error("Error: param bias is not supported")
+        sys.exit(1)
+
+    # TODO: these seem to be layers that have been trained but without lora.
+    # doesn't seem widely used but eventually should be supported
+    modules_to_save = params.get("modules_to_save")
+    if modules_to_save is not None and len(modules_to_save) > 0:
+        logger.error("Error: param modules_to_save is not supported")
+        sys.exit(1)
+
+    # mode "wb" already truncates the file, so no explicit truncate() call is needed
+    with open(output_path, "wb") as fout:
+        write_file_header(fout, params)
+        for k, v in model.items():
+            orig_k = k
+            if k.endswith(".default.weight"):
+                k = k.replace(".default.weight", ".weight")
+            if k in ["llama_proj.weight", "llama_proj.bias"]:
+                continue
+            if k.endswith("lora_A.weight"):
+                # lora_A keeps float16/float32 as-is and is stored transposed
+                if v.dtype != torch.float16 and v.dtype != torch.float32:
+                    v = v.float()
+                v = v.T
+            else:
+                v = v.float()
+
+            t = v.detach().numpy()
+
+            prefix = "base_model.model."
+            if k.startswith(prefix):
+                k = k[len(prefix) :]
+
+            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+            if k.endswith(lora_suffixes):
+                # both suffixes have the same length, so the first one's length works for either
+                suffix = k[-len(lora_suffixes[0]):]
+                k = k[: -len(lora_suffixes[0])]
+            else:
+                logger.error(f"Error: unrecognized tensor name {orig_k}")
+                sys.exit(1)
+
+            tname = name_map.get_name(k)
+            if tname is None:
+                logger.error(f"Error: could not map tensor name {orig_k}")
+                logger.error(" Note: the arch parameter must be specified if the model is not llama")
+                sys.exit(1)
+
+            if suffix == ".lora_A.weight":
+                tname += ".weight.loraA"
+            elif suffix == ".lora_B.weight":
+                tname += ".weight.loraB"
+            else:
+                assert False
+
+            logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+            write_tensor_header(fout, tname, t.shape, t.dtype)
+            t.tofile(fout)
+
+    logger.info(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import logging
+import argparse
+import os
+import sys
+from pathlib import Path
+from pprint import pprint
+
+import torch
+from sentencepiece import SentencePieceProcessor
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+logger = logging.getLogger("persimmon-to-gguf")
+
+
+def _flatten_dict(dct, tensors, prefix=None):
+    """Recursively copy every tensor of the nested dict `dct` into `tensors` under a dotted key path.
+
+    Raises ValueError (carrying the offending type) for any value that is neither a
+    torch.Tensor nor a dict.
+    """
+    assert isinstance(dct, dict)
+    for key, value in dct.items():
+        if prefix is None:
+            path = key
+        else:
+            path = prefix + '.' + key
+        if isinstance(value, torch.Tensor):
+            tensors[path] = value
+            continue
+        if not isinstance(value, dict):
+            raise ValueError(type(value))
+        # nested dict: descend with the extended path
+        _flatten_dict(value, tensors, path)
+
+
+def _get_sentencepiece_tokenizer_info(dir_model: Path):
+    """Load `adept_vocab.model` from `dir_model` and return parallel (tokens, scores, toktypes) lists.
+
+    Each token is the UTF-8 encoded piece. toktypes holds one integer code per token:
+    1 by default, overridden to 2 / 3 / 5 / 6 for unknown / control / unused / byte pieces.
+    """
+    tokenizer_path = dir_model / 'adept_vocab.model'
+    # the path is passed as a %-style argument; without a placeholder in the message the
+    # logging module reports a formatting error and the path is never shown
+    logger.info('getting sentencepiece tokenizer from %s', tokenizer_path)
+    tokenizer = SentencePieceProcessor(str(tokenizer_path))
+    logger.info('adding tokens')
+    tokens: list[bytes] = []
+    scores: list[float] = []
+    toktypes: list[int] = []
+
+    for i in range(tokenizer.vocab_size()):
+        piece = tokenizer.id_to_piece(i)
+        text: bytes = piece.encode("utf-8")
+        score: float = tokenizer.get_score(i)
+
+        # the checks are independent; when several match, the last one wins
+        toktype = 1
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+    return tokens, scores, toktypes
+
+
+def main():
+    """Convert an Adept Persimmon checkpoint into a GGUF file.
+
+    Parses the command line, loads the checkpoint given by --ckpt-path, writes the
+    hyperparameters and the tokenizer found in --model-dir as GGUF metadata, then writes
+    every tensor (converted to float32) to the output file.
+    """
+    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
+    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
+    # both paths are used unconditionally below, so let argparse reject a missing one up front
+    parser.add_argument("--ckpt-path",           type=Path, required=True, help="path to persimmon checkpoint .pt file")
+    parser.add_argument("--model-dir",           type=Path, required=True, help="directory containing model e.g. 8b_chat_model_release")
+    parser.add_argument("--adept-inference-dir", type=str,  help="path to adept-inference code directory")
+    parser.add_argument("--verbose",  action="store_true",  help="increase output verbosity")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
+    # only extend sys.path when a directory was given (otherwise the string "None" would be added)
+    # NOTE(review): presumably needed so unpickling the checkpoint can import adept classes - confirm
+    if args.adept_inference_dir is not None:
+        sys.path.append(str(args.adept_inference_dir))
+    # implement the documented "default: based on input": checkpoint path with a .gguf suffix
+    if args.outfile is None:
+        args.outfile = args.ckpt_path.with_suffix(".gguf")
+    # NOTE(security): torch.load unpickles the file - only convert checkpoints from a trusted source
+    persimmon_model = torch.load(args.ckpt_path)
+    hparams = persimmon_model['args']
+    pprint(hparams)
+    tensors: dict[str, torch.Tensor] = {}
+    _flatten_dict(persimmon_model['model'], tensors, None)
+
+    arch = gguf.MODEL_ARCH.PERSIMMON
+    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
+
+    block_count = hparams.num_layers
+    head_count = hparams.num_attention_heads
+    head_count_kv = head_count
+    ctx_length = hparams.seq_length
+    hidden_size = hparams.hidden_size
+
+    gguf_writer.add_name('persimmon-8b-chat')
+    gguf_writer.add_context_length(ctx_length)
+    gguf_writer.add_embedding_length(hidden_size)
+    gguf_writer.add_block_count(block_count)
+    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
+    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
+    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
+    gguf_writer.add_head_count(head_count)
+    gguf_writer.add_head_count_kv(head_count_kv)
+    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
+    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
+
+    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
+    gguf_writer.add_tokenizer_model('llama')
+    gguf_writer.add_tokenizer_pre('default')
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+    # the same hard-coded id is written for both BOS and EOS
+    gguf_writer.add_bos_token_id(71013)
+    gguf_writer.add_eos_token_id(71013)
+
+    tensor_map = gguf.get_tensor_name_map(arch, block_count)
+    logger.info(tensor_map)
+    for name in tensors.keys():
+        data_torch = tensors[name]
+        # these tensors are not written to the output
+        if name.endswith(".self_attention.rotary_emb.inv_freq"):
+            continue
+        old_dtype = data_torch.dtype
+        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
+        data = data_torch.to(torch.float32).squeeze().numpy()
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            raise ValueError(f"Can not map tensor '{name}'")
+
+        n_dims = len(data.shape)
+        logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
+        gguf_writer.add_tensor(new_name, data)
+    logger.info("gguf: write header")
+    gguf_writer.write_header_to_file()
+    logger.info("gguf: write metadata")
+    gguf_writer.write_kv_data_to_file()
+    logger.info("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
+
+    logger.info(f"gguf: model successfully exported to '{args.outfile}'")
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
@@ -24,16 +24,14 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable

 import numpy as np
+from sentencepiece import SentencePieceProcessor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    # use .parent.parent since we are in "examples" directory
-    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
-
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
-from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if TYPE_CHECKING:
    from typing_extensions import Self, TypeAlias
@@ -176,7 +174,7 @@ class Params:
    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
    f_rope_scale: float | None = None
-    n_ctx_orig: int | None = None
+    n_orig_ctx: int | None = None
    rope_finetuned: bool | None = None

    ftype: GGMLFileType | None = None
@@ -226,7 +224,7 @@ class Params:
        with open(config_path) as f:
            config = json.load(f)

-        rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
        rope_scaling = config.get("rope_scaling")

        if rope_scaling is not None and (typ := rope_scaling.get("type")):
@@ -236,7 +234,7 @@ class Params:
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_ctx_orig = rope_scaling['original_max_position_embeddings']
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
                rope_finetuned = rope_scaling['finetuned']
            else:
                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@@ -272,7 +270,7 @@ class Params:
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
            f_rope_scale      = f_rope_scale,
-            n_ctx_orig        = n_ctx_orig,
+            n_orig_ctx        = n_orig_ctx,
            rope_finetuned    = rope_finetuned,
        )

@@ -286,7 +284,6 @@ class Params:
        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None
-        n_ff = None

        # hack to determine LLaMA v1 vs v2 vs CodeLlama
        if config.get("moe"):
@@ -311,8 +308,6 @@ class Params:
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

-        assert n_ff is not None
-
        return Params(
            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
@@ -346,40 +341,302 @@ class Params:
        return params


-@dataclass
-class Metadata:
-    name: Optional[str] = None
-    author: Optional[str] = None
-    version: Optional[str] = None
-    url: Optional[str] = None
-    description: Optional[str] = None
-    licence: Optional[str] = None
-    source_url: Optional[str] = None
-    source_hf_repo: Optional[str] = None
+#
+# vocab
+#

-    @staticmethod
-    def load(metadata_path: Path) -> Metadata:
-        if metadata_path is None or not metadata_path.exists():
-            return Metadata()
+@runtime_checkable
+class BaseVocab(Protocol):
+    """Minimal protocol for a vocabulary: two class-level string identifiers."""
+    tokenizer_model: ClassVar[str]  # e.g. "gpt2", "llama" or "no_vocab" in the classes of this file
+    name: ClassVar[str]  # short identifier, e.g. "bpe", "spm", "hfft" or "no_vocab"

-        with open(metadata_path, 'r') as file:
-            data = json.load(file)

-        # Create a new Metadata instance
-        metadata = Metadata()
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"

-        # Assigning values to Metadata attributes if they exist in the JSON file
-        # This is based on LLM_KV_NAMES mapping in llama.cpp
-        metadata.name = data.get("general.name")
-        metadata.author = data.get("general.author")
-        metadata.version = data.get("general.version")
-        metadata.url = data.get("general.url")
-        metadata.description = data.get("general.description")
-        metadata.license = data.get("general.license")
-        metadata.source_url = data.get("general.source.url")
-        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"

-        return metadata
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    """Protocol for a vocabulary loaded from a model directory that can enumerate its tokens."""
+    vocab_size: int  # the implementations in this file set it to base size + number of added tokens
+    added_tokens_dict: dict[str, int]  # added token text -> token id
+    added_tokens_list: list[str]  # added token texts, ordered by token id in the implementations
+    fname_tokenizer: Path  # the tokenizer file the vocabulary was read from
+
+    def __init__(self, base_path: Path): ...
+    # yields one (token bytes, score, token type) tuple per token
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class BpeVocab(Vocab):
+    """BPE vocabulary read from `vocab.json` or, failing that, from the fast-tokenizer JSON file."""
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
+    def __init__(self, base_path: Path):
+        """Load the vocabulary found under `base_path`.
+
+        Raises:
+            FileNotFoundError: the fast-tokenizer file is missing, or it is not a BPE model
+                without byte fallback and with a ByteLevel decoder.
+            ValueError: the added token ids do not directly follow the base vocabulary.
+        """
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                # the added-tokens file is optional
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            # a mismatching tokenizer kind is reported with the same exception type as a missing file
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        # added ids must be exactly vocab_size, vocab_size + 1, ... with no gaps
+        vocab_size   = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids   = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        # order the added tokens by id
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
+        self.vocab_size_base      = vocab_size
+        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the base vocabulary in id order, each with score 0.0 and type NORMAL."""
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
+
+        # looks up ids 0..len(vocab)-1; a missing id raises KeyError
+        for i, _ in enumerate(self.vocab):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the added tokens in id order, UTF-8 encoded, with score -1000.0 and type CONTROL."""
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the base tokens followed by the added tokens."""
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab(Vocab):
+    """Vocabulary backed by a SentencePiece `tokenizer.model` file."""
+    tokenizer_model = "llama"
+    name = "spm"
+
+    def __init__(self, base_path: Path):
+        """Load `tokenizer.model` from `base_path`, or from its parent directory as a fallback.
+
+        The added-tokens file is only read when the model is found in `base_path` itself.
+
+        Raises:
+            FileNotFoundError: `tokenizer.model` exists in neither location.
+            ValueError: the ids of the new tokens do not directly follow the base vocabulary.
+        """
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                # the added-tokens file is optional
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+
+        # only entries whose id lies beyond the base vocabulary count as new tokens
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict  = added_tokens
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield (UTF-8 piece, score, type) for every id of the SentencePiece model."""
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text         = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+
+            # the checks are independent; when several match, the last one wins
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.is_unknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.is_control(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.is_unused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.is_byte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the new tokens in id order, UTF-8 encoded, with score -1000.0 and type USER_DEFINED."""
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the SentencePiece tokens followed by the new tokens."""
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class LlamaHfVocab(Vocab):
+    """Vocabulary read through a HuggingFace fast tokenizer (tokenizer JSON with byte fallback)."""
+    tokenizer_model = "llama"
+    name = "hfft"
+
+    def __init__(self, base_path: Path):
+        """Load the fast tokenizer stored under `base_path`.
+
+        Raises:
+            FileNotFoundError: the tokenizer file is missing, or it is not a BPE model with
+                byte fallback and a Sequence decoder.
+            TypeError: the file looks like a Llama 3 tokenizer (BPE with ignore_merges and
+                byte_fallback explicitly disabled); those must use BpeVocab.
+            ImportError: the `transformers` package is not installed.
+        """
+        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        # is_llama3 is known to be false from here on, so it needs no re-check
+        if (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use LlamaHfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            base_path,
+            cache_dir=base_path,
+            local_files_only=True,
+        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = {}
+        self.added_tokens_ids  = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        # (fetch the vocabulary once instead of once per special token)
+        full_vocab = self.tokenizer.get_vocab()
+        self.specials = {
+            tok: full_vocab[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield (UTF-8 text, score, type) for every base-vocabulary id that is not an added token."""
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        """Classify a token: BYTE for `<0xNN>` texts, CONTROL for ids in `special_ids`, else NORMAL."""
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        """Return the score for `token_id`; currently the constant -1000.0 for every token."""
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the added tokens in id order; special ones are typed via get_token_type, others USER_DEFINED."""
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                # the empty text skips the byte-token pattern, so only the id decides the type
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        """Return True when the vocabulary contains `<0x0A>` or a literal newline token."""
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        """Yield the base tokens followed by the added tokens."""
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


 #
@@ -649,7 +906,7 @@ class LazyUnpickler(pickle.Unpickler):
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

-    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
+    CLASSES = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -805,42 +1062,21 @@ class OutputFile:
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
-        # Metadata About The Model And Its Provenence
-        name = "LLaMA"
-        if metadata is not None and metadata.name is not None:
-            name = metadata.name
-        elif params.path_model is not None:
-            name = params.path_model.name
-        elif params.n_ctx == 4096:
-            # Heuristic detection of LLaMA v2 model
-            name = "LLaMA v2"
-
-        self.gguf.add_name(name)
-
-        if metadata is not None:
-            if metadata.author is not None:
-                self.gguf.add_author(metadata.author)
-            if metadata.version is not None:
-                self.gguf.add_version(metadata.version)
-            if metadata.url is not None:
-                self.gguf.add_url(metadata.url)
-            if metadata.description is not None:
-                self.gguf.add_description(metadata.description)
-            if metadata.licence is not None:
-                self.gguf.add_licence(metadata.licence)
-            if metadata.source_url is not None:
-                self.gguf.add_source_url(metadata.source_url)
-            if metadata.source_hf_repo is not None:
-                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
-
    def add_meta_arch(self, params: Params) -> None:
-        # Metadata About The Neural Architecture Itself
-        self.gguf.add_vocab_size(params.n_vocab)
-        self.gguf.add_context_length(params.n_ctx)
-        self.gguf.add_embedding_length(params.n_embd)
-        self.gguf.add_block_count(params.n_layer)
-        self.gguf.add_feed_forward_length(params.n_ff)
+        name = "LLaMA"
+
+        # TODO: better logic to determine model name
+        if params.n_ctx == 4096:
+            name = "LLaMA v2"
+        elif params.path_model is not None:
+            name = str(params.path_model.parent).split('/')[-1]
+
+        self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
@@ -864,8 +1100,8 @@ class OutputFile:
            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
            self.gguf.add_rope_scaling_factor(params.f_rope_scale)

-        if params.n_ctx_orig is not None:
-            self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)

        if params.rope_finetuned is not None:
            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
@@ -943,14 +1179,13 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
-        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
        of.add_meta_special_vocab(svocab)
@@ -977,14 +1212,12 @@ class OutputFile:
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
-        metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
-        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        if isinstance(vocab, Vocab):
            of.add_meta_vocab(vocab)
@@ -1020,37 +1253,6 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    raise ValueError(f"Unexpected combination of types: {name_to_type}")


-def model_parameter_count(model: LazyModel) -> int:
-    total_model_parameters = 0
-    for i, (name, lazy_tensor) in enumerate(model.items()):
-        sum_weights_in_tensor = 1
-        for dim in lazy_tensor.shape:
-            sum_weights_in_tensor *= dim
-        total_model_parameters += sum_weights_in_tensor
-    return total_model_parameters
-
-
-def model_parameter_count_rounded_notation(model_params_count: int) -> str:
-    if model_params_count > 1e12 :
-        # Trillions Of Parameters
-        scaled_model_params = model_params_count * 1e-12
-        scale_suffix = "T"
-    elif model_params_count > 1e9 :
-        # Billions Of Parameters
-        scaled_model_params = model_params_count * 1e-9
-        scale_suffix = "B"
-    elif model_params_count > 1e6 :
-        # Millions Of Parameters
-        scaled_model_params = model_params_count * 1e-6
-        scale_suffix = "M"
-    else:
-        # Thousands Of Parameters
-        scaled_model_params = model_params_count * 1e-3
-        scale_suffix = "K"
-
-    return f"{round(scaled_model_params)}{scale_suffix}"
-
-
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
            for (name, tensor) in model.items()}
@@ -1230,35 +1432,13 @@ class VocabFactory:
        return vocab, special_vocab


-def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
-    quantization = {
-        GGMLFileType.AllF32:    "F32",
-        GGMLFileType.MostlyF16: "F16",
-        GGMLFileType.MostlyQ8_0: "Q8_0",
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+    namestr = {
+        GGMLFileType.AllF32:    "f32",
+        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
-
-    parameters = model_parameter_count_rounded_notation(model_params_count)
-
-    expert_count = ""
-    if params.n_experts is not None:
-        expert_count = f"{params.n_experts}x"
-
-    version = ""
-    if metadata is not None and metadata.version is not None:
-        version = f"-{metadata.version}"
-
-    name = "ggml-model"
-    if metadata is not None and metadata.name is not None:
-        name = metadata.name
-    elif params.path_model is not None:
-        name = params.path_model.name
-
-    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
-
-
-def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
-    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
-    ret = model_paths[0].parent / f"{default_filename}.gguf"
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
        logger.error(
            f"Error: Default output path ({ret}) would overwrite the input. "
@@ -1296,30 +1476,17 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
-    parser.add_argument("--metadata",     type=Path,              help="Specify the path for a metadata file")
-    parser.add_argument("--get-outfile",  action="store_true",    help="get calculated default outfile name")

    args = parser.parse_args(args_in)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
-    elif args.dump_single or args.dump or args.get_outfile:
+    elif args.dump_single or args.dump:
        # Avoid printing anything besides the dump output
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

-    metadata = Metadata.load(args.metadata)
-
-    if args.get_outfile:
-        model_plus = load_some_model(args.model)
-        params = Params.load(model_plus)
-        model   = convert_model_names(model_plus.model, params, args.skip_unknown)
-        model_params_count = model_parameter_count(model_plus.model)
-        ftype   = pick_output_type(model, args.outtype)
-        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
-        return
-
    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

@@ -1333,9 +1500,6 @@ def main(args_in: list[str] | None = None) -> None:
    else:
        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

-    model_params_count = model_parameter_count(model_plus.model)
-    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
-
    if args.dump:
        do_dump_model(model_plus)
        return
@@ -1344,27 +1508,25 @@ def main(args_in: list[str] | None = None) -> None:
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG

-    params = None
-    if args.pad_vocab or not args.vocab_only:
-        params = Params.load(model_plus)
-        if params.n_ctx == -1:
-            if args.ctx is None:
-                msg = """\
-                    The model doesn't have a context size, and you didn't specify one with --ctx
-                    Please specify one with --ctx:
-                     - LLaMA v1: --ctx 2048
-                     - LLaMA v2: --ctx 4096"""
-                parser.error(textwrap.dedent(msg))
-            params.n_ctx = args.ctx
+    params = Params.load(model_plus)
+    if params.n_ctx == -1:
+        if args.ctx is None:
+            msg = """\
+                The model doesn't have a context size, and you didn't specify one with --ctx
+                Please specify one with --ctx:
+                 - LLaMA v1: --ctx 2048
+                 - LLaMA v2: --ctx 4096"""
+            parser.error(textwrap.dedent(msg))
+        params.n_ctx = args.ctx

-        if args.outtype:
-            params.ftype = {
-                "f32": GGMLFileType.AllF32,
-                "f16": GGMLFileType.MostlyF16,
-                "q8_0": GGMLFileType.MostlyQ8_0,
-            }[args.outtype]
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+            "q8_0": GGMLFileType.MostlyQ8_0,
+        }[args.outtype]

-        logger.info(f"params = {params}")
+    logger.info(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1377,19 +1539,8 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        if params is None:
-            params = Params(
-                n_vocab    = vocab.vocab_size,
-                n_embd     = 1,
-                n_layer    = 1,
-                n_ctx      = 1,
-                n_ff       = 1,
-                n_head     = 1,
-                n_head_kv  = 1,
-                f_norm_eps = 1e-5,
-            )
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
        logger.info(f"Wrote {outfile}")
        return

@@ -1402,13 +1553,13 @@ def main(args_in: list[str] | None = None) -> None:
    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)

    params.ftype = ftype
    logger.info(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
    logger.info(f"Wrote {outfile}")


--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

--- a/docs/debugging-tests.md
+++ b/docs/debugging-tests.md
@@ -1,104 +0,0 @@
-# Debugging Tests Tips
-
-## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
-
-There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
-
-For example, running the following command will output an interactive list from which you can select a test. It takes this form:
-
-`debug-test.sh [OPTION]... <test_regex> <test_number>`
-
-It will then build & run in the debugger for you.
-
-To just execute a test and get back a PASS or FAIL message run:
-
-```bash
-./scripts/debug-test.sh test-tokenizer
-```
-
-To test in GDB use the `-g` flag to enable gdb test mode.
-
-```bash
-./scripts/debug-test.sh -g test-tokenizer
-
-# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
->>> b main
-```
-
-To speed up the testing loop, if you know your test number you can just run it similar to below:
-
-```bash
-./scripts/debug-test.sh test 23
-```
-
-For further reference use `debug-test.sh -h` to print help.
-
-&nbsp;
-
-### How does the script work?
-If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
-
-#### Step 1: Reset and Setup folder context
-
-From base of this repository, let's create `build-ci-debug` as our build context.
-
-```bash
-rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
-```
-
-#### Step 2: Setup Build Environment and Compile Test Binaries
-
-Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
-
-```bash
-cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
-make -j
-```
-
-#### Step 3: Find all tests available that matches REGEX
-
-The output of this command will give you the command & arguments needed to run GDB.
-
-* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
-* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
-* `-V` : Verbose Mode
-
-```bash
-ctest -R "test-tokenizer" -V -N
-```
-
-This may return output similar to below (focusing on key lines to pay attention to):
-
-```bash
-...
-1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
-1: Working Directory: .
-Labels: main
-  Test  #1: test-tokenizer-0-llama-spm
-...
-4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
-4: Working Directory: .
-Labels: main
-  Test  #4: test-tokenizer-0-falcon
-...
-```
-
-#### Step 4: Identify Test Command for Debugging
-
-So for test #1 above we can tell these two pieces of relevant information:
-* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
-* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
-
-#### Step 5: Run GDB on test command
-
-Based on the ctest 'test command' report above we can then run a gdb session via this command below:
-
-```bash
-gdb --args ${Test Binary} ${Test GGUF Model}
-```
-
-Example:
-
-```bash
-gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
-```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,6 +15,7 @@ else()
    add_subdirectory(baby-llama)
    add_subdirectory(batched)
    add_subdirectory(batched-bench)
+    add_subdirectory(beam-search)
    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
@@ -48,7 +49,4 @@ else()
        add_subdirectory(server)
    endif()
    add_subdirectory(export-lora)
-    if (LLAMA_RPC)
-        add_subdirectory(rpc)
-    endif()
 endif()
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+# Runs ./main with the model ./models/alpaca.13b.ggmlv3.q8_0.bin and
+# ./prompts/alpaca.txt passed via -f.
+#
+
+# Switch to the parent of this script's directory so the relative ./main,
+# ./models and ./prompts paths below resolve. The path is quoted so directories
+# containing spaces work, and the script stops if either cd fails.
+cd "$(dirname "$0")" || exit 1
+cd .. || exit 1
+
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
+       --color \
+       -f ./prompts/alpaca.txt \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 7
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, 1]
            // Kcur shape [n_embd/n_head, n_head, N, 1]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);

            // store key and value to memory
            {
@@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
            // wk   shape [n_embd, n_embd, 1, 1]
            // Qcur shape [n_embd/n_head, n_head, N, n_batch]
            // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
            assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
            assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);

@@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
                                                        model->layers[il].wqb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
-                                            KQ_pos, n_rot, 0);
+                                            KQ_pos, n_rot, 0, 0);
            struct ggml_tensor * Kcur = ggml_rope(ctx0,
                                            ggml_reshape_3d(ctx0,
                                                ggml_mul_mat(ctx0,
@@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
                                                        model->layers[il].wkb,
                                                        cur)),
                                                n_embd/n_head, n_head, N),
-                                            KQ_pos, n_rot, 0);
+                                            KQ_pos, n_rot, 0, 0);

            // store key and value to memory
            {
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
+./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 0 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 0 1 99

 # custom set of batches
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 0 999 128,256,512 128,256 1,2,4,8,16,32
 ```

 ## Sample results
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -28,27 +28,67 @@ static std::vector<int> parse_list(char * p) {
    return ret;
 }

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
-}
-
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
+    if (argc == 1 || argv[1][0] == '-') { // positional arguments only: no args, or an option-like first arg, prints usage
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("  <PP>, <TG> and <PL> are comma-separated lists of numbers without spaces\n\n");
+        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); // FATTN=0, IS_PP_SHARED=0, NGL=999
+        return 1;
+    }

-    int is_pp_shared = params.is_pp_shared;
+    int n_kv_max     = 2048;
+    int n_batch      = 2048;
+    int n_ubatch     = 512;
+    bool flash_attn  = false;
+    int is_pp_shared = 0;
+    int n_gpu_layers = 0;

-    std::vector<int> n_pp = params.n_pp;
-    std::vector<int> n_tg = params.n_tg;
-    std::vector<int> n_pl = params.n_pl;
+    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
+    std::vector<int> n_tg = { 128, 256, };
+    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
+    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
+
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        n_kv_max = std::atoi(argv[2]);
+    }
+
+    if (argc >= 4) {
+        n_batch = std::atoi(argv[3]);
+    }
+
+    if (argc >= 5) {
+        n_ubatch = std::atoi(argv[4]);
+    }
+
+    if (argc >= 6) {
+        flash_attn = std::atoi(argv[5]);
+    }
+
+    if (argc >= 7) {
+        is_pp_shared = std::atoi(argv[6]);
+    }
+
+    if (argc >= 8) {
+        n_gpu_layers = std::atoi(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_pp = parse_list(argv[8]);
+    }
+
+    if (argc >= 10) {
+        n_tg = parse_list(argv[9]);
+    }
+
+    if (argc >= 11) {
+        n_pl = parse_list(argv[10]);
+    }

    // init LLM

@@ -57,7 +97,12 @@ int main(int argc, char ** argv) {

    // initialize the model

-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = llama_model_default_params();
+
+    const std::vector<float> t_split(llama_max_devices(), 0.0f);
+
+    model_params.n_gpu_layers = n_gpu_layers;
+    model_params.tensor_split = t_split.data();

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -66,7 +111,16 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.seed       = 1234;
+    ctx_params.n_ctx      = n_kv_max;
+    ctx_params.n_batch    = n_batch;
+    ctx_params.n_ubatch   = n_ubatch;
+    ctx_params.flash_attn = flash_attn;
+
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    // ensure enough sequences are available
    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@@ -78,8 +132,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const int32_t n_kv_max = llama_n_ctx(ctx);
-
    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);

    // decode in batches of ctx_params.n_batch tokens
@@ -123,7 +175,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4

 ...

--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -7,31 +7,48 @@
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
-}
-
 int main(int argc, char ** argv) {
    gpt_params params;

-    params.prompt = "Hello my name is";
-    params.n_predict = 32;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
+        return 1 ;
    }

-
    // number of parallel batches
-    int n_parallel = params.n_parallel;
+    int n_parallel = 1;

    // total length of the sequences including the prompt
-    int n_predict = 32;
+    int n_len = 32;
+
+    // number of layers to offload to the GPU
+    int n_gpu_layers = 0;
+
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        params.prompt = argv[2];
+    }
+
+    if (argc >= 4) {
+        n_parallel = std::atoi(argv[3]);
+    }
+
+    if (argc >= 5) {
+        n_len = std::atoi(argv[4]);
+    }
+
+    if (argc >= 6) {
+        n_gpu_layers = std::atoi(argv[5]);
+    }
+
+    if (params.prompt.empty()) {
+        params.prompt = "Hello my name is";
+    }
+
+    process_escapes(params.prompt);

    // init LLM

@@ -40,7 +57,9 @@ int main(int argc, char ** argv) {

    // initialize the model

-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = llama_model_default_params();
+
+    model_params.n_gpu_layers = n_gpu_layers;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -54,14 +73,18 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(model, params.prompt, true);

-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
+    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

    // initialize the context

-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = llama_context_default_params();

+    ctx_params.seed  = 1234;
    ctx_params.n_ctx   = n_kv_req;
-    ctx_params.n_batch = std::max(n_predict, n_parallel);
+    ctx_params.n_batch = std::max(n_len, n_parallel);
+    ctx_params.n_seq_max       = n_parallel;
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -70,9 +93,9 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const int n_ctx = llama_n_ctx(ctx);
+    const int n_ctx    = llama_n_ctx(ctx);

-    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
@@ -133,7 +156,7 @@ int main(int argc, char ** argv) {

    const auto t_main_start = ggml_time_us();

-    while (n_cur <= n_predict) {
+    while (n_cur <= n_len) {
        // prepare the next batch
        llama_batch_clear(batch);

@@ -169,7 +192,7 @@ int main(int argc, char ** argv) {
            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of generation? -> mark the stream as finished
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
--- a/examples/beam-search/CMakeLists.txt
+++ b/examples/beam-search/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET beam-search) # name of the example executable target
+add_executable(${TARGET} beam-search.cpp) # built from the single source file beam-search.cpp
+install(TARGETS ${TARGET} RUNTIME) # install the executable as a runtime artifact
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) # links `common`, `llama` and the thread library
+target_compile_features(${TARGET} PRIVATE cxx_std_11) # the source requires at least C++11
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -0,0 +1,188 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+// Debug helper: pairs a beam view with the context needed to render its tokens as text.
+struct ostream_beam_view {
+    llama_context * ctx;       // passed to llama_token_to_piece() when the beam is printed
+    llama_beam_view beam_view; // beam whose p, eob and tokens are streamed by operator<<
+};
+
+// Streams a beam as "p(<p>) eob(<true|false>) tokens(<text>)"; std::boolalpha stays set on os afterwards.
+static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
+    const llama_beam_view & view = obv.beam_view;
+    os << "p(" << view.p << ") eob(" << std::boolalpha << view.eob << ") tokens(";
+    for (size_t idx = 0; idx < view.n_tokens; ++idx) { os << llama_token_to_piece(obv.ctx, view.tokens[idx]); }
+    return os << ')';
+}
+
+// Put here anything you want back in beam_search_callback().
+struct beam_search_callback_data {
+    llama_context * ctx;               // context used for the end-of-generation check and for debug printing
+    std::vector<llama_token> response; // grows by the beams' common prefix on each callback invocation
+};
+
+// True when a non-empty beam ends in an end-of-generation token (other criteria, e.g. max length, could be added).
+static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
+    if (n_tokens == 0) { return false; }
+    return llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens - 1]);
+}
+
+// Callback of type llama_beam_search_callback_fn_t, invoked each time the beam lengths increase
+// and once more when the stop condition is met. On every call it:
+//  * flags each beam that now ends in an end-of-generation token as end-of-beam,
+//  * prints ',' as a progress marker, followed by the number of convergent beam tokens if any,
+//  * appends the common prefix shared by all beams (read from beam_views[0]) to the
+//    std::vector<llama_token> response inside the beam_search_callback_data passed as callback_data_ptr.
+static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+    auto & cb_data = *static_cast<beam_search_callback_data *>(callback_data_ptr);
+    for (size_t ib = 0; ib < beams_state.n_beams; ++ib) { // mark beams that just reached end-of-beam
+        llama_beam_view & view = beams_state.beam_views[ib];
+        if (view.eob) { continue; } // already flagged
+        if (is_at_eob(cb_data, view.tokens, view.n_tokens)) {
+            view.eob = true;
+        }
+    }
+    printf(",");  // progress marker
+    const size_t n_common = beams_state.common_prefix_length;
+    if (n_common != 0) {
+        assert(0u < beams_state.n_beams);
+        const llama_token * prefix = beams_state.beam_views[0].tokens;
+        cb_data.response.insert(cb_data.response.end(), prefix, prefix + n_common);
+        printf("%zu", n_common);
+    }
+    fflush(stdout);
+#if 1 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
+    for (size_t ib = 0; ib < beams_state.n_beams; ++ib) {
+        std::cout << "beams[" << ib << "]: " << ostream_beam_view{cb_data.ctx, beams_state.beam_views[ib]} << std::endl;
+    }
+#endif
+}
+
+// Example driver: loads a model, evaluates a prompt and extends it with llama_beam_search().
+//
+// Usage: beam-search MODEL_PATH [BEAM_WIDTH=2] [PROMPT]
+//   MODEL_PATH  model file handed to llama_init_from_gpt_params() (must not start with '-')
+//   BEAM_WIDTH  number of beams, a positive integer (default 2)
+//   PROMPT      text to continue; a built-in sample request is used when omitted or empty
+//
+// Returns 0 on success and 1 on a usage error, model load failure, over-long prompt or failed decode.
+int main(int argc, char ** argv)
+{
+    gpt_params params;
+    //params.n_gpu_layers = 200;
+
+    // --- Print help ---
+    if (argc < 2 || argv[1][0] == '-') {
+        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
+        return 1;
+    }
+
+    // --- Load parameters ---
+    params.model = argv[1];
+
+    // std::stoi() throws on non-numeric or out-of-range input; treat that, and any
+    // width below 1, as a usage error instead of terminating on an uncaught exception.
+    params.n_beams = 2;
+    if (argc > 2) {
+        try {
+            params.n_beams = std::stoi(argv[2]);
+        } catch (...) {
+            params.n_beams = 0;
+        }
+    }
+    if (params.n_beams < 1) {
+        fprintf( stderr , "%s: error: BEAM_WIDTH must be a positive integer, got '%s'\n" , __func__ , argv[2] );
+        return 1;
+    }
+
+    if (argc > 3) {
+        params.prompt = argv[3];
+    }
+    if (params.prompt.empty()) {
+        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
+    }
+
+    // --- Init LLM ---
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    if (model == NULL) {
+        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+        return 1;
+    }
+
+    // --- Tokenize the prompt ---
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+
+    const size_t max_context_size     = llama_n_ctx( ctx );
+    const size_t max_tokens_list_size = max_context_size - 4 ;
+
+    if (tokens_list.size() > max_tokens_list_size) {
+        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
+             __func__ , tokens_list.size() , max_tokens_list_size );
+        return 1;
+    }
+
+    fprintf( stderr, "\n\n" );
+
+    // Print the tokens from the prompt :
+    for (auto id : tokens_list) {
+        std::cout << llama_token_to_piece(ctx, id);
+    }
+    std::cout << std::flush;
+
+    // --- Evaluate the prompt, then run the beam search ---
+    int n_past = 0;
+    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) {
+        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
+        return 1;
+    }
+    n_past += tokens_list.size();
+
+    beam_search_callback_data callback_data{ctx, {}};
+    size_t const beam_width = static_cast<size_t>(params.n_beams);
+    int const n_predict = 256;
+    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
+
+    // Print the tokens collected by the callback.
+    std::cout << "\n\n";
+    for (llama_token const token_id : callback_data.response) {
+        std::cout << llama_token_to_piece(ctx,token_id);
+    }
+    std::cout << std::endl;
+
+    llama_free( ctx );
+    llama_free_model( model );
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -2,7 +2,7 @@

 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.

-To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model, first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:

 `$ make -j`

--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {

    params.samples_start_after_nl = false;
    params.use_adam               = true;
-    params.use_flash              = false;
+    params.use_flash              = true;
    params.use_scratch            = true;

    // only adam
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -49,12 +49,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        }

        float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
        llama_embd_normalize(embd, out, n_embd);
    }
 }
@@ -63,7 +57,6 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -80,6 +73,9 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -105,7 +101,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    // split the prompt into lines
@@ -127,12 +123,10 @@ int main(int argc, char ** argv) {
        inputs.push_back(inp);
    }

-    // check if the last token is SEP
-    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
+    // add SEP if not present
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s:          'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            inp.push_back(llama_token_sep(model));
        }
    }

@@ -209,7 +203,6 @@ int main(int argc, char ** argv) {

    // clean up
    llama_print_timings(ctx);
-    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) (data + i)); // i is a byte offset: apply it before the cast and dereference
                    } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) &data[i];
+                        v = *(float *) (data + i);
                    } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) &data[i];
+                        v = (float) *(int32_t *) (data + i);
                    } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) &data[i];
+                        v = (float) *(int16_t *) (data + i);
                    } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) &data[i];
+                        v = (float) *(int8_t *) (data + i);
                    } else {
                        GGML_ASSERT(false);
                    }
@@ -140,18 +140,20 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
+
    callback_data cb_data;

    gpt_params params;
-
    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    print_build_info();

    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -174,7 +176,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    bool OK = run(ctx, params);
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        // not capturing these, to silcence warnings
        const int rope_mode = 0;

-        return ggml_rope_ext(ctx,
-            t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
+        return ggml_rope_custom(ctx,
+            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };
@@ -643,8 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct ggml_tensor * t15 = ggml_permute      (ctx, t12, 0, 3, 1, 2);                         set_name(t15, "t15");     assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
        struct ggml_tensor * t16;
        if (enable_flash_attn) {
-            GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
-            //t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
+            t16 = ggml_flash_attn(ctx, t13, t14, t15, true);                                         set_name(t16, "t16");     assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
        } else {
            struct ggml_tensor * t16_0 = ggml_mul_mat              (ctx, t14, t13);                  set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
            struct ggml_tensor * t16_1 = ggml_scale_inplace        (ctx, t16_0, kv_scale);           set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
    int n;
    if (str.back() == 'M') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1000 * 1000; // megabytes
+        n_bytes = (size_t)n * 1024 * 1024; // megabytes
    } else if (str.back() == 'G') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
+        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
    } else {
        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
    }
@@ -284,7 +284,7 @@ struct split_strategy {
                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
                total_size += ggml_nbytes(t);
            }
-            total_size = total_size / 1000 / 1000; // convert to megabytes
+            total_size = total_size / 1024 / 1024; // convert to megabytes
            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
            i_split++;
        }
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -41,7 +41,7 @@ echo PASS
 echo

 # 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
 echo PASS
 echo

@@ -51,7 +51,7 @@ echo PASS
 echo

 # 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
 echo PASS
 echo

@@ -61,7 +61,7 @@ echo PASS
 echo

 # 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
 echo PASS
 echo

@@ -71,7 +71,7 @@ echo
 #echo

 # 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
 #echo PASS
 #echo

@@ -81,7 +81,7 @@ echo PASS
 echo

 # 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
 echo PASS
 echo

--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+# Run from the parent of this script's directory; quote the path and abort if cd fails.
+cd "$(dirname "$0")/.." || exit 1
+
+./main --color --instruct --threads 4 \
+       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
+       --file ./prompts/alpaca.txt \
+       --batch_size 8 --ctx_size 2048 -n -1 \
+       --repeat_last_n 64 --repeat_penalty 1.3 \
+       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -153,9 +153,7 @@ static std::string gritlm_instruction(const std::string & instruction) {

 int main(int argc, char * argv[]) {
    gpt_params params;
-
    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -6,19 +6,16 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage

 ```
-./imatrix \
-    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
-    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
-    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
+./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
+        [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
 ```

 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
-* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.

 For faster computation, make sure to use GPU offloading via the `-ngl` argument

--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -17,37 +17,39 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
-            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
-            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-    LOG_TEE("\n");
-}
-
 struct Stats {
    std::vector<float> values;
    std::vector<int> counts;
    int ncall = 0;
 };

+struct StatParams { // settings held by IMatrixCollector in m_params
+    std::string dataset;                       // dataset name written at the end of every saved imatrix file
+    std::string ofile = "imatrix.dat";         // output file; "imatrix.dat" is also used when this is empty
+    int         n_output_frequency = 10;       // save_imatrix() runs when the call count is a multiple of this
+    int         verbosity = 1;                 // > 0: print a message to stderr each time data is stored
+    int         keep_every = 0;                // > 0: also write "<ofile>.at_<ncall>" at every multiple of this
+    bool        collect_output_weight = false; // also collect data for the "output.weight" tensor
+};
+
 class IMatrixCollector {
 public:
    IMatrixCollector() = default;
-    void set_params(gpt_params params) { m_params = std::move(params); }
+    void set_parameters(StatParams&& params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix(int ncall = -1) const;
-    bool load_imatrix(const char * file_name);
+    void save_imatrix() const;
+    bool load_imatrix(const char * file_name, bool add);
+    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
 private:
    std::unordered_map<std::string, Stats> m_stats;
-    gpt_params                             m_params;
+    StatParams                             m_params;
    std::mutex                             m_mutex;
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
+                                                  //
+    void save_imatrix(const char * file_name, const char * dataset) const;
+    void keep_imatrix(int ncall) const;
 };

 // remove any prefix and suffixes from the name
@@ -83,7 +85,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        if (t->op != GGML_OP_MUL_MAT) return false;
        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
        return true;
    }

@@ -151,25 +153,21 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j]*x[j];
                        e.counts[e_start + j]++;
-                        if (!std::isfinite(e.values[e_start + j])) {
-                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
-                            exit(1);
-                        }
                    }
                }
            }
            if (e.ncall > m_last_call) {
                m_last_call = e.ncall;
-                if (m_last_call % m_params.n_out_freq == 0) {
+                if (m_last_call % m_params.n_output_frequency == 0) {
                    save_imatrix();
                }
-                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                    save_imatrix(m_last_call);
+                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                    keep_imatrix(m_last_call);
                }
            }
        }
    } else {
-        auto & e = m_stats[wname];
+        auto& e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
            e.counts.resize(src1->ne[0], 0);
@@ -187,19 +185,15 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
-                if (!std::isfinite(e.values[j])) {
-                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
-                    exit(1);
-                }
            }
        }
        if (e.ncall > m_last_call) {
            m_last_call = e.ncall;
-            if (m_last_call % m_params.n_out_freq == 0) {
+            if (m_last_call % m_params.n_output_frequency == 0) {
                save_imatrix();
            }
-            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
-                save_imatrix(m_last_call);
+            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                keep_imatrix(m_last_call);
            }
        }
    }
@@ -207,75 +201,33 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    return true;
 }

-void IMatrixCollector::save_imatrix(int ncall) const {
-    auto fname = m_params.out_file;
-    if (fname.empty()) {
-        fname = "imatrix.dat";
-    }
+void IMatrixCollector::save_imatrix() const { // writes to m_params.ofile, or "imatrix.dat" when it is empty
+    save_imatrix(!m_params.ofile.empty() ? m_params.ofile.c_str() : "imatrix.dat", m_params.dataset.c_str());
+}

-    if (ncall > 0) {
-        fname += ".at_";
-        fname += std::to_string(ncall);
-    }
-
-    // avoid writing imatrix entries that do not have full data
-    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
-
-    int n_entries = 0;
-    std::vector<std::string> to_store;
-
-    bool is_first = true; // for printing
-    for (const auto & kv : m_stats) {
-        const int n_all = kv.second.counts.size();
-
-        if (n_all == 0) {
-            continue;
-        }
-
-        int n_zeros = 0;
-        for (const int c : kv.second.counts) {
-            if (c == 0) {
-                n_zeros++;
-            }
-        }
-
-        if (n_zeros != 0 && is_first) {
-            fprintf(stderr, "\n");
-            is_first = false;
-        }
-
-        if (n_zeros == n_all) {
-            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
-            continue;
-        }
-
-        if (n_zeros > 0) {
-            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
-            continue;
-        }
-
-        n_entries++;
-        to_store.push_back(kv.first);
-    }
-
-    if (to_store.size() < m_stats.size()) {
-        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
-    }
+// Stores the collected data in a per-checkpoint file named "<ofile>.at_<ncall>",
+// where <ofile> falls back to "imatrix.dat" when no output file is configured.
+void IMatrixCollector::keep_imatrix(int ncall) const {
+    const std::string base = m_params.ofile.empty() ? std::string("imatrix.dat") : m_params.ofile;
+    const std::string checkpoint = base + ".at_" + std::to_string(ncall);
+    save_imatrix(checkpoint.c_str(), m_params.dataset.c_str());
+}

+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
    std::ofstream out(fname, std::ios::binary);
+    int n_entries = m_stats.size();
    out.write((const char *) &n_entries, sizeof(n_entries));
-    for (const auto & name : to_store) {
-        const auto & stat = m_stats.at(name);
-        int len = name.size();
+    for (const auto & p : m_stats) {
+        int len = p.first.size();
        out.write((const char *) &len, sizeof(len));
-        out.write(name.c_str(), len);
-        out.write((const char *) &stat.ncall, sizeof(stat.ncall));
-        int nval = stat.values.size();
+        out.write(p.first.c_str(), len);
+        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
+        int nval = p.second.values.size();
        out.write((const char *) &nval, sizeof(nval));
        if (nval > 0) {
            std::vector<float> tmp(nval);
            for (int i = 0; i < nval; i++) {
-                tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
+                tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
            }
            out.write((const char*)tmp.data(), nval*sizeof(float));
        }
@@ -284,28 +236,26 @@ void IMatrixCollector::save_imatrix(int ncall) const {
    // Write the number of call the matrix was computed with
    out.write((const char *) &m_last_call, sizeof(m_last_call));

-    // Write the input filename at the end of the file to later on specify it in quantize
-    {
-        int len = m_params.prompt_file.size();
-        out.write((const char *) &len, sizeof(len));
-        out.write(m_params.prompt_file.c_str(), len);
-    }
+    // Write the dataset name at the end of the file to later on specify it in quantize
+    int n_dataset = strlen(dataset);
+    out.write((const char *) &n_dataset, sizeof(n_dataset));
+    out.write(dataset, n_dataset);

    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
    }
 }

-bool IMatrixCollector::load_imatrix(const char * fname) {
-    std::ifstream in(fname, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
+    std::ifstream in(imatrix_file, std::ios::binary);
    if (!in) {
-        printf("%s: failed to open %s\n",__func__, fname);
+        printf("%s: failed to open %s\n",__func__,imatrix_file);
        return false;
    }
    int n_entries;
    in.read((char*)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, fname);
+        printf("%s: no data in file %s\n", __func__, imatrix_file);
        return false;
    }
    for (int i = 0; i < n_entries; ++i) {
@@ -313,22 +263,23 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
-        auto & e = m_stats[std::move(name)];
+        auto& e = imatrix_data[std::move(name)];
        int ncall;
        in.read((char*)&ncall, sizeof(ncall));
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            printf("%s: failed reading number of values for entry %d\n",__func__,i);
-            m_stats = {};
+            imatrix_data = {};
            return false;
        }

+        // When re-called from load_imatrix() with add set, this will already be created.
        if (e.values.empty()) {
            e.values.resize(nval, 0);
            e.counts.resize(nval, 0);
@@ -338,7 +289,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        in.read((char*)tmp.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n",__func__,i);
-            m_stats = {};
+            imatrix_data = {};
            return false;
        }

@@ -353,6 +304,13 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
    return true;
 }

+// Loads the statistics stored in file_name into m_stats via the static overload.
+// When add is false the existing m_stats entries are cleared first; returns the overload's result.
+bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
+    if (add == false) { m_stats.clear(); }
+    return load_imatrix(file_name, m_stats);
+}
+
 static IMatrixCollector g_collector;

 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -366,7 +324,7 @@ struct results_log_softmax {
    float  prob;
 };

-static std::vector<float> softmax(const std::vector<float> & logits) {
+static std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) {
@@ -400,7 +358,8 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to

 static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history) {
+    double & nll, double & nll2, float * logit_history, float * prob_history
+) {
    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -432,7 +391,8 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
+
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
    const int n_ctx = llama_n_ctx(ctx);
@@ -445,13 +405,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

-    if (params.i_chunk > 0) {
-        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+    if (from_chunk > 0) {
+        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
            return false;
        }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
-        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
    }

    if (int(tokens.size()) < 2*n_ctx) {
@@ -464,7 +424,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    std::vector<float> logit_history;
    std::vector<float> prob_history;

-    if (params.compute_ppl) {
+    if (compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }
@@ -486,7 +446,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    const int num_batches = (n_ctx + n_batch - 1) / n_batch;

    std::vector<float> logits;
-    if (params.compute_ppl && num_batches > 1) {
+    if (compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }

@@ -522,7 +482,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

-            if (params.compute_ppl && num_batches > 1) {
+            if (compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
@@ -541,7 +501,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        if (params.compute_ppl) {
+        if (compute_ppl) {
            const int first = n_ctx/2;
            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@@ -556,7 +516,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    }
    printf("\n");

-    if (params.compute_ppl) {
+    if (compute_ppl) {
        nll2 /= count;
        nll /= count;
        const double ppl = exp(nll);
@@ -573,32 +533,111 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
-    gpt_params params;

-    params.n_ctx = 512;
-    params.logits_all = true;
-    params.verbosity = 1;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
+    StatParams sparams;
+    std::string prev_result_file;
+    std::string combine_files;
+    bool compute_ppl = true;
+    int  from_chunk  = 0;
+    std::vector<char*> args;
+    args.push_back(argv[0]);
+    int iarg = 1;
+    for (; iarg < argc-1; ++iarg) {
+        std::string arg{argv[iarg]};
+        if (arg == "-o" || arg == "--output-file") {
+            sparams.ofile = argv[++iarg];
+        }
+        else if (arg == "-ofreq" || arg == "--output-frequency") {
+            sparams.n_output_frequency = std::stoi(argv[++iarg]);
+        }
+        else if (arg == "-ow" || arg == "--output-weight") {
+            sparams.collect_output_weight = std::stoi(argv[++iarg]);
+        }
+        else if (arg == "--verbosity") {
+            sparams.verbosity = std::stoi(argv[++iarg]);
+        } else if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else if (arg == "--keep-imatrix") {
+            sparams.keep_every = std::stoi(argv[++iarg]);
+        } else if (arg == "--continue-from") {
+            prev_result_file = argv[++iarg];
+        } else if (arg == "--combine") {
+            combine_files = argv[++iarg];
+        }
+        else if (arg == "--from-chunk") {
+            from_chunk = std::stoi(argv[++iarg]);
+        } else {
+            args.push_back(argv[iarg]);
+        }
    }
-
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    g_collector.set_params(params);
-
-    for (const auto & in_file : params.in_files) {
-        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
-        if (!g_collector.load_imatrix(in_file.c_str())) {
-            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
-            return 1;
+    if (iarg < argc) {
+        std::string arg{argv[iarg]};
+        if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else {
+            args.push_back(argv[iarg]);
        }
    }

-    if (params.in_files.size() > 1) {
-        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    sparams.dataset = params.prompt_file;
+    g_collector.set_parameters(std::move(sparams));
+
+    if (!combine_files.empty()) {
+        std::vector<std::string> files;
+        size_t pos = 0;
+        while (true) {
+            auto new_pos = combine_files.find(',', pos);
+            if (new_pos != std::string::npos) {
+                files.emplace_back(combine_files.substr(pos, new_pos - pos));
+                pos = new_pos + 1;
+            } else {
+                files.emplace_back(combine_files.substr(pos));
+                break;
+            }
+        }
+        if (files.size() < 2) {
+            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+            return 1;
+        }
+        printf("Combining the following %d files\n", int(files.size()));
+        for (auto& file : files) {
+            printf("    %s\n", file.c_str());
+            if (!g_collector.load_imatrix(file.c_str(), true)) {
+                fprintf(stderr, "Failed to load %s\n", file.c_str());
+                return 1;
+            }
+        }
        g_collector.save_imatrix();
+        return 0;
+    }
+
+    if (!prev_result_file.empty()) {
+        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
+            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
+            return 1;
+        }
    }

    llama_backend_init();
@@ -613,7 +652,6 @@ int main(int argc, char ** argv) {
    // init
    llama_model * model;
    llama_context * ctx;
-
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to init\n", __func__);
@@ -629,10 +667,11 @@ int main(int argc, char ** argv) {
    // print system information
    {
        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

-    if (!compute_imatrix(ctx, params)) {
+    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
+    if (!OK) {
        return 1;
    }

--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -50,9 +50,9 @@ static void write_logfile(
        return;
    }

-    const std::string timestamp = string_get_sortable_timestamp();
+    const std::string timestamp = get_sortable_timestamp();

-    const bool success = fs_create_directory_with_parents(params.logdir);
+    const bool success = create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
+    dump_string_yaml_multiline(logfile, "output", output.c_str());
+    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
@@ -107,7 +107,6 @@ int main(int argc, char ** argv) {
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -140,6 +139,27 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
+    if (params.instruct) {
+        printf("\n************\n");
+        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+    if (params.chatml) {
+        printf("\n************\n");
+        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+    if (!params.antiprompt.empty()) {
+        printf("\n************\n");
+        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
        printf("\n************\n");
        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@@ -147,6 +167,20 @@ int main(int argc, char ** argv) {

        return 0;
    }
+    if (params.random_prompt) {
+        printf("\n************\n");
+        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+    if (!params.path_prompt_cache.empty()) {
+        printf("\n************\n");
+        printf("%s: infill does not support prompt caching\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }

    if (params.rope_freq_base != 0.0) {
        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@@ -173,13 +207,17 @@ int main(int argc, char ** argv) {

    llama_model * model;
    llama_context * ctx;
-
+    llama_context * ctx_guidance = NULL;
    g_model = &model;
    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (sparams.cfg_scale > 1.f) {
+        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
+        ctx_guidance = llama_new_context_with_model(model, lparams);
+    }

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -198,7 +236,7 @@ int main(int argc, char ** argv) {
    // print system information
    {
        LOG_TEE("\n");
-        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
    GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -235,6 +273,25 @@ int main(int argc, char ** argv) {
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

+    // Tokenize negative prompt
+    std::vector<llama_token> guidance_inp;
+    int guidance_offset = 0;
+    int original_prompt_len = 0;
+    if (ctx_guidance) {
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
+
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
+
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
+
+        original_prompt_len = original_inp.size();
+        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
+        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
+    }
+
    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
@@ -262,6 +319,15 @@ int main(int argc, char ** argv) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

+        if (ctx_guidance) {
+            LOG_TEE("\n");
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            for (int i = 0; i < (int) guidance_inp.size(); i++) {
+                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
+            }
+        }
+
        if (params.n_keep > 0) {
        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
@@ -329,11 +395,12 @@ int main(int argc, char ** argv) {
        is_interacting = params.interactive_first;
    }

-    bool input_echo = true;
+    bool input_echo           = true;

-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
+    int n_past             = 0;
+    int n_remain           = params.n_predict;
+    int n_consumed         = 0;
+    int n_past_guidance    = 0;

    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
@@ -343,6 +410,7 @@ int main(int argc, char ** argv) {
    console::set_display(console::prompt);

    std::vector<llama_token> embd;
+    std::vector<llama_token> embd_guidance;

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

@@ -368,7 +436,7 @@ int main(int argc, char ** argv) {
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
+            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
@@ -385,7 +453,11 @@ int main(int argc, char ** argv) {

                n_past -= n_discard;

-                LOG("after swap: n_past = %d\n", n_past);
+                if (ctx_guidance) {
+                    n_past_guidance -= n_discard;
+                }
+
+                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

@@ -393,6 +465,45 @@ int main(int argc, char ** argv) {

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
+
+            if (ctx_guidance) {
+                int input_size = 0;
+                llama_token * input_buf = NULL;
+
+                if (n_past_guidance < (int) guidance_inp.size()) {
+                    // Guidance context should have the same data with these modifications:
+                    //
+                    // * Replace the initial prompt
+                    // * Shift everything by guidance_offset
+                    embd_guidance = guidance_inp;
+                    if (embd.begin() + original_prompt_len < embd.end()) {
+                        embd_guidance.insert(
+                            embd_guidance.end(),
+                            embd.begin() + original_prompt_len,
+                            embd.end()
+                        );
+                    }
+
+                    input_buf  = embd_guidance.data();
+                    input_size = embd_guidance.size();
+
+                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
+                } else {
+                    input_buf  = embd.data();
+                    input_size = embd.size();
+                }
+
+                for (int i = 0; i < input_size; i += params.n_batch) {
+                    int n_eval = std::min(input_size - i, params.n_batch);
+                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
+                        LOG_TEE("%s : failed to eval\n", __func__);
+                        return 1;
+                    }
+
+                    n_past_guidance += n_eval;
+                }
+            }
+
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
@@ -414,9 +525,11 @@ int main(int argc, char ** argv) {
        }

        embd.clear();
+        embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

            llama_sampling_accept(ctx_sampling, ctx, id, true);

@@ -470,6 +583,7 @@ int main(int argc, char ** argv) {

        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
+
            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
@@ -507,8 +621,8 @@ int main(int argc, char ** argv) {

                if (params.escape) {
                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
-                    string_process_escapes(params.input_prefix);
-                    string_process_escapes(params.input_suffix);
+                    process_escapes(params.input_prefix);
+                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
@@ -530,6 +644,7 @@ int main(int argc, char ** argv) {
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                embd_inp.push_back(llama_token_middle(model));
                embd.clear();
+                embd_guidance.clear();
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
@@ -636,6 +751,7 @@ int main(int argc, char ** argv) {
    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

+    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,22 +6,52 @@ import re
 import sys
 from typing import Any, Dict, List, Set, Tuple, Union

-
-def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
-
-    if min_items == 0 and max_items == 1:
-        return f'{item_rule}?'
-
+def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
    if not separator_rule:
-        if min_items == 1 and max_items is None:
+        if min_items == 0 and max_items == 1:
+            return f'{item_rule}?'
+        elif min_items == 1 and max_items is None:
            return f'{item_rule}+'
-        elif min_items == 0 and max_items is None:
-            return f'{item_rule}*'
-        else:
-            return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'

-    result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
-    return f'({result})?' if min_items == 0 else result
+    result = ''
+
+    if min_items > 0:
+        if item_rule_is_literal and separator_rule is None:
+            result = '"' + (item_rule[1:-1] * min_items) + '"'
+        else:
+            result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+
+    def opt_repetitions(up_to_n, prefix_with_sep=False):
+        '''
+            - n=4, no sep:             '(a (a (a (a)?)?)?)?'
+            - n=4, sep=',', prefix:    '("," a ("," a ("," a ("," a)?)?)?)?'
+            - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
+        '''
+
+        content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
+        if up_to_n == 0:
+            return ''
+        elif up_to_n == 1:
+            return f'({content})?'
+        elif separator_rule and not prefix_with_sep:
+            return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
+        else:
+            return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
+
+    if min_items > 0 and max_items != min_items:
+        result += ' '
+
+    if max_items is not None:
+        result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
+    else:
+        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+
+        if min_items == 0 and separator_rule:
+            result = f'({item_rule} {item_operator}*)?'
+        else:
+            result += f'{item_operator}*'
+
+    return result


 class BuiltinRule:
@@ -29,28 +59,31 @@ class BuiltinRule:
        self.content = content
        self.deps = deps or []

-# Constraining spaces to prevent model "running away".
-SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
+_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
+
+# whitespace is constrained to a single space char to prevent model "running away" in
+# whitespace. Also maybe improves generation quality?
+SPACE_RULE = '" "?'

 PRIMITIVE_RULES = {
    'boolean'      : BuiltinRule('("true" | "false") space', []),
-    'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
-    'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
+    'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
+    'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
-    'char'         : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
+    'uuid'         : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
+    'char'         : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
    'null'         : BuiltinRule('"null" space', []),
 }

 # TODO: support "uri", "email" string formats
 STRING_FORMAT_RULES = {
-    'date'            : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
-    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+    'date'            : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
@@ -300,7 +333,7 @@ class SchemaConverter:
                            sub_rule_ids[sub] = id
                        sub = id

-                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
+                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
                else:
                    literal = ''
                    while i < length:
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -26,21 +26,16 @@ options:
  -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
  -p, --n-prompt <n>                  (default: 512)
  -n, --n-gen <n>                     (default: 128)
-  -pg <pp,tg>                         (default: 512,128)
-  -b, --batch-size <n>                (default: 2048)
-  -ub, --ubatch-size <n>              (default: 512)
-  -ctk, --cache-type-k <t>            (default: f16)
-  -ctv, --cache-type-v <t>            (default: f16)
-  -t, --threads <n>                   (default: 16)
+  -b, --batch-size <n>                (default: 512)
+  -ctk <t>, --cache-type-k <t>        (default: f16)
+  -ctv <t>, --cache-type-v <t>        (default: f16)
+  -t, --threads <n>                   (default: 112)
  -ngl, --n-gpu-layers <n>            (default: 99)
  -sm, --split-mode <none|layer|row>  (default: layer)
  -mg, --main-gpu <i>                 (default: 0)
  -nkvo, --no-kv-offload <0|1>        (default: 0)
-  -fa, --flash-attn <0|1>             (default: 0)
  -mmp, --mmap <0|1>                  (default: 1)
-  --numa <distribute|isolate|numactl> (default: disabled)
-  -embd, --embeddings <0|1>           (default: 0)
-  -ts, --tensor-split <ts0/ts1/..>    (default: 0)
+  -ts, --tensor_split <ts0/ts1/..>    (default: 0)
  -r, --repetitions <n>               (default: 5)
  -o, --output <csv|json|md|sql>      (default: md)
  -v, --verbose                       (default: 0)
@@ -48,11 +43,10 @@ options:
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```

-llama-bench can perform three types of tests:
+llama-bench can perform two types of tests:

 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)

 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).

@@ -162,7 +156,7 @@ $ ./llama-bench -o csv
 ```

 ```csv
-build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
 "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
 ```
@@ -179,6 +173,7 @@ $ ./llama-bench -o json
    "build_commit": "3469684",
    "build_number": 1275,
    "cuda": true,
+    "opencl": false,
    "metal": false,
    "gpu_blas": true,
    "blas": true,
@@ -209,6 +204,7 @@ $ ./llama-bench -o json
    "build_commit": "3469684",
    "build_number": 1275,
    "cuda": true,
+    "opencl": false,
    "metal": false,
    "gpu_blas": true,
    "blas": true,
@@ -251,6 +247,7 @@ CREATE TABLE IF NOT EXISTS test (
  build_commit TEXT,
  build_number INTEGER,
  cuda INTEGER,
+  opencl INTEGER,
  metal INTEGER,
  gpu_blas INTEGER,
  blas INTEGER,
@@ -276,6 +273,6 @@ CREATE TABLE IF NOT EXISTS test (
  stddev_ts REAL
 );

-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
 ```
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -41,6 +41,20 @@ static std::string join(const std::vector<T> & values, const std::string & delim
    return str.str();
 }

+template<class T> // split `str` on `delim` and parse every token into a T with operator>>
+static std::vector<T> split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value{}; // value-initialize: an empty token makes operator>> fail before it writes to `value`
+        std::istringstream token_stream(token);
+        token_stream >> value; // NOTE: for std::string this keeps only the first whitespace-delimited word
+        values.push_back(value);
+    }
+    return values;
+}
+
 template<typename T, typename F>
 static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> str_values;
@@ -126,11 +140,10 @@ static std::string get_gpu_info() {
 }

 // command line params
-enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
+enum output_formats {CSV, JSON, MARKDOWN, SQL};

 static const char * output_format_str(output_formats format) {
    switch (format) {
-        case NONE:     return "none";
        case CSV:      return "csv";
        case JSON:     return "json";
        case MARKDOWN: return "md";
@@ -139,23 +152,6 @@ static const char * output_format_str(output_formats format) {
    }
 }

-static bool output_format_from_str(const std::string & s, output_formats & format) {
-    if (s == "none") {
-        format = NONE;
-    } else if (s == "csv") {
-        format = CSV;
-    } else if (s == "json") {
-        format = JSON;
-    } else if (s == "md") {
-        format = MARKDOWN;
-    } else if (s == "sql") {
-        format = SQL;
-    } else {
-        return false;
-    }
-    return true;
-}
-
 static const char * split_mode_str(llama_split_mode mode) {
    switch (mode) {
        case LLAMA_SPLIT_MODE_NONE:  return "none";
@@ -165,24 +161,16 @@ static const char * split_mode_str(llama_split_mode mode) {
    }
 }

-static std::string pair_str(const std::pair<int, int> & p) {
-    static char buf[32];
-    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
-    return buf;
-}
-
 struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
-    std::vector<std::pair<int, int>> n_pg;
    std::vector<int> n_batch;
    std::vector<int> n_ubatch;
    std::vector<ggml_type> type_k;
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
-    std::vector<std::string> rpc_servers;
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
@@ -194,33 +182,29 @@ struct cmd_params {
    int reps;
    bool verbose;
    output_formats output_format;
-    output_formats output_format_stderr;
 };

 static const cmd_params cmd_params_defaults = {
-    /* model                */ {"models/7B/ggml-model-q4_0.gguf"},
-    /* n_prompt             */ {512},
-    /* n_gen                */ {128},
-    /* n_pg                 */ {},
-    /* n_batch              */ {2048},
-    /* n_ubatch             */ {512},
-    /* type_k               */ {GGML_TYPE_F16},
-    /* type_v               */ {GGML_TYPE_F16},
-    /* n_threads            */ {cpu_get_num_math()},
-    /* n_gpu_layers         */ {99},
-    /* rpc_servers          */ {""},
-    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
-    /* main_gpu             */ {0},
-    /* no_kv_offload        */ {false},
-    /* flash_attn           */ {false},
-    /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
-    /* use_mmap             */ {true},
-    /* embeddings           */ {false},
-    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
-    /* reps                 */ 5,
-    /* verbose              */ false,
-    /* output_format        */ MARKDOWN,
-    /* output_format_stderr */ NONE,
+    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt      */ {512},
+    /* n_gen         */ {128},
+    /* n_batch       */ {2048},
+    /* n_ubatch      */ {512},
+    /* type_k        */ {GGML_TYPE_F16},
+    /* type_v        */ {GGML_TYPE_F16},
+    /* n_threads     */ {get_math_cpu_count()},
+    /* n_gpu_layers  */ {99},
+    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
+    /* main_gpu      */ {0},
+    /* no_kv_offload */ {false},
+    /* flash_attn    */ {false},
+    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap      */ {true},
+    /* embeddings    */ {false},
+    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
+    /* reps          */ 5,
+    /* verbose       */ false,
+    /* output_format */ MARKDOWN
 };

 static void print_usage(int /* argc */, char ** argv) {
@@ -231,14 +215,12 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -pg <pp,tg>                         (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub, --ubatch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -ub N, --ubatch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -249,7 +231,6 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -291,7 +272,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {

    params.verbose = cmd_params_defaults.verbose;
    params.output_format = cmd_params_defaults.output_format;
-    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
    params.reps = cmd_params_defaults.reps;

    for (int i = 1; i < argc; i++) {
@@ -308,53 +288,42 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto p = split<std::string>(argv[i], split_delim);
            params.model.insert(params.model.end(), p.begin(), p.end());
        } else if (arg == "-p" || arg == "--n-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<int>(argv[i], split_delim);
+            auto p = split<int>(argv[i], split_delim);
            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
        } else if (arg == "-n" || arg == "--n-gen") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<int>(argv[i], split_delim);
+            auto p = split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
-        } else if (arg == "-pg") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = string_split<std::string>(argv[i], ',');
-            if (p.size() != 2) {
-                invalid_param = true;
-                break;
-            }
-            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<int>(argv[i], split_delim);
+            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
        } else if (arg == "-ub" || arg == "--ubatch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<int>(argv[i], split_delim);
+            auto p = split<int>(argv[i], split_delim);
            params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto p = split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
@@ -370,7 +339,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto p = split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
@@ -386,27 +355,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<int>(argv[i], split_delim);
+            auto p = split<int>(argv[i], split_delim);
            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<int>(argv[i], split_delim);
+            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-rpc" || arg == "--rpc") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rpc_servers.push_back(argv[i]);
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<std::string>(argv[i], split_delim);
+            auto p = split<std::string>(argv[i], split_delim);
            std::vector<llama_split_mode> modes;
            for (const auto & m : p) {
                llama_split_mode mode;
@@ -428,13 +391,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            params.main_gpu = string_split<int>(argv[i], split_delim);
+            params.main_gpu = split<int>(argv[i], split_delim);
        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<bool>(argv[i], split_delim);
+            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
        } else if (arg == "--numa") {
            if (++i >= argc) {
@@ -452,28 +415,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<bool>(argv[i], split_delim);
+            auto p = split<bool>(argv[i], split_delim);
            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<bool>(argv[i], split_delim);
+            auto p = split<bool>(argv[i], split_delim);
            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
        } else if (arg == "-embd" || arg == "--embeddings") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = string_split<bool>(argv[i], split_delim);
+            auto p = split<bool>(argv[i], split_delim);
            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            for (auto ts : string_split<std::string>(argv[i], split_delim)) {
+            for (auto ts : split<std::string>(argv[i], split_delim)) {
                // split string by ; and /
                const std::regex regex{R"([;/]+)"};
                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@@ -501,13 +464,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                invalid_param = true;
                break;
            }
-            invalid_param = !output_format_from_str(argv[i], params.output_format);
-        } else if (arg == "-oe" || arg == "--output-err") {
-            if (++i >= argc) {
+            if (argv[i] == std::string("csv")) {
+                params.output_format = CSV;
+            } else if (argv[i] == std::string("json")) {
+                params.output_format = JSON;
+            } else if (argv[i] == std::string("md")) {
+                params.output_format = MARKDOWN;
+            } else if (argv[i] == std::string("sql")) {
+                params.output_format = SQL;
+            } else {
                invalid_param = true;
                break;
            }
-            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else {
@@ -525,13 +493,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
-    if (params.n_pg.empty())         { params.n_pg = cmd_params_defaults.n_pg; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
    if (params.n_ubatch.empty())     { params.n_ubatch = cmd_params_defaults.n_ubatch; }
    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
-    if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -554,7 +520,6 @@ struct cmd_params_instance {
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
-    std::string rpc_servers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
@@ -567,9 +532,6 @@ struct cmd_params_instance {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers.empty()) {
-            mparams.rpc_servers = rpc_servers.c_str();
-        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
@@ -581,7 +543,6 @@ struct cmd_params_instance {
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
-               rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
@@ -610,7 +571,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
-    for (const auto & rpc : params.rpc_servers)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
@@ -637,7 +597,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@@ -663,33 +622,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
-                /* .split_mode   = */ sm,
-                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
-                /* .tensor_split = */ ts,
-                /* .use_mmap     = */ mmp,
-                /* .embeddings   = */ embd,
-            };
-            instances.push_back(instance);
-        }
-
-        for (const auto & n_pg : params.n_pg) {
-            if (n_pg.first == 0 && n_pg.second == 0) {
-                continue;
-            }
-            cmd_params_instance instance = {
-                /* .model        = */ m,
-                /* .n_prompt     = */ n_pg.first,
-                /* .n_gen        = */ n_pg.second,
-                /* .n_batch      = */ nb,
-                /* .n_ubatch     = */ nub,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
-                /* .n_threads    = */ nt,
-                /* .n_gpu_layers = */ nl,
-                /* .rpc_servers  = */ rpc,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
@@ -709,11 +641,11 @@ struct test {
    static const std::string build_commit;
    static const int build_number;
    static const bool cuda;
+    static const bool opencl;
    static const bool vulkan;
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
-    static const bool rpc;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@@ -797,6 +729,9 @@ struct test {
        if (cuda) {
            return GGML_CUDA_NAME;
        }
+        if (opencl) {
+            return "OpenCL";
+        }
        if (vulkan) {
            return "Vulkan";
        }
@@ -809,9 +744,6 @@ struct test {
        if (sycl) {
            return GGML_SYCL_NAME;
        }
-        if (rpc) {
-            return "RPC";
-        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
@@ -825,7 +757,7 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_ubatch",
@@ -851,7 +783,7 @@ struct test {
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
+        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
            return BOOL;
@@ -880,8 +812,8 @@ struct test {
        }
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
+            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas), // backend flags, same order as the names in get_fields()
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_ubatch),
@@ -909,13 +841,13 @@ struct test {
 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cuda();
+const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::sycl         = !!ggml_cpu_has_sycl();
-const bool        test::rpc          = !!ggml_cpu_has_rpc();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();

@@ -1033,30 +965,6 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return 3;
        }
-        if (field == "n_threads") {
-            return 7;
-        }
-        if (field == "n_batch") {
-            return 7;
-        }
-        if (field == "n_ubatch") {
-            return 8;
-        }
-        if (field == "type_k" || field == "type_v") {
-            return 6;
-        }
-        if (field == "split_mode") {
-            return 5;
-        }
-        if (field == "flash_attn") {
-            return 2;
-        }
-        if (field == "use_mmap") {
-            return 4;
-        }
-        if (field == "test") {
-            return 13;
-        }

        int width = std::max((int)field.length(), 10);

@@ -1183,11 +1091,12 @@ struct markdown_printer : public printer {
                value = test::get_backend();
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
+                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
+                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
                } else {
-                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
+                    assert(false);
+                    exit(1);
                }
                value = buf;
            } else if (field == "t/s") {
@@ -1297,22 +1206,6 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
    (void) user_data;
 }

-static std::unique_ptr<printer> create_printer(output_formats format) {
-    switch (format) {
-        case NONE:
-            return nullptr;
-        case CSV:
-            return std::unique_ptr<printer>(new csv_printer());
-        case JSON:
-            return std::unique_ptr<printer>(new json_printer());
-        case MARKDOWN:
-            return std::unique_ptr<printer>(new markdown_printer());
-        case SQL:
-            return std::unique_ptr<printer>(new sql_printer());
-    }
-    GGML_ASSERT(false);
-}
-
 int main(int argc, char ** argv) {
    // try to set locale for unicode characters in markdown
    setlocale(LC_CTYPE, ".UTF-8");
@@ -1339,18 +1232,26 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // initialize printer
-    std::unique_ptr<printer> p = create_printer(params.output_format);
-    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
-
-    if (p) {
-        p->fout = stdout;
-        p->print_header(params);
-    }
-
-    if (p_err) {
-        p_err->fout = stderr;
-        p_err->print_header(params);
+    std::unique_ptr<printer> p;
+    switch (params.output_format) {
+        case CSV:
+            p.reset(new csv_printer());
+            break;
+        case JSON:
+            p.reset(new json_printer());
+            break;
+        case MARKDOWN:
+            p.reset(new markdown_printer());
+            break;
+        case SQL:
+            p.reset(new sql_printer());
+            break;
+        default:
+            assert(false);
+            exit(1);
    }
+    p->fout = stdout;
+    p->print_header(params);

    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

@@ -1396,7 +1297,6 @@ int main(int argc, char ** argv) {
            llama_kv_cache_clear(ctx);

            uint64_t t_start = get_time_ns();
-
            if (t.n_prompt > 0) {
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
            }
@@ -1408,15 +1308,7 @@ int main(int argc, char ** argv) {
            t.samples_ns.push_back(t_ns);
        }

-        if (p) {
-            p->print_test(t);
-            fflush(p->fout);
-        }
-
-        if (p_err) {
-            p_err->print_test(t);
-            fflush(p_err->fout);
-        }
+        p->print_test(t);

        llama_print_timings(ctx);

@@ -1425,13 +1317,7 @@ int main(int argc, char ** argv) {

    llama_free_model(lmodel);

-    if (p) {
-        p->print_footer();
-    }
-
-    if (p_err) {
-        p_err->print_footer();
-    }
+    p->print_footer();

    llama_backend_free();

--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -7,6 +7,8 @@ android {
    namespace = "com.example.llama"
    compileSdk = 34

+    ndkVersion = "26.1.10909125"
+
    defaultConfig {
        applicationId = "com.example.llama"
        minSdk = 33
@@ -18,6 +20,17 @@ android {
        vectorDrawables {
            useSupportLibrary = true
        }
+        ndk {
+            // Add NDK properties if wanted, e.g.
+            // abiFilters += listOf("arm64-v8a")
+        }
+        externalNativeBuild {
+            cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
+                cppFlags += listOf()
+                arguments += listOf()
+            }
+        }
    }

    buildTypes {
@@ -42,6 +55,17 @@ android {
    composeOptions {
        kotlinCompilerExtensionVersion = "1.5.1"
    }
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path = file("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
 }

 dependencies {
@@ -54,7 +78,6 @@ dependencies {
    implementation("androidx.compose.ui:ui-graphics")
    implementation("androidx.compose.ui:ui-tooling-preview")
    implementation("androidx.compose.material3:material3")
-    implementation(project(":llama"))
    testImplementation("junit:junit:4.13.2")
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 # For more information about using CMake with Android Studio, read the
 # documentation: https://d.android.com/studio/projects/add-native-code.html.
 # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
@@ -35,15 +36,15 @@ FetchContent_MakeAvailable(llama)
 # for GameActivity/NativeActivity derived applications, the same library name must be
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
-        # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)

 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
 # build script, prebuilt third-party libraries, or Android system libraries.
 target_link_libraries(${CMAKE_PROJECT_NAME}
-        # List libraries link to the target library
-        llama
-        common
-        android
-        log)
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
    llama_model_params model_params = llama_model_default_params();

    auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
+Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
    llama_free_model(reinterpret_cast<llama_model *>(model));
 }

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
    auto model = reinterpret_cast<llama_model *>(jmodel);

    if (!model) {
@@ -139,25 +139,25 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
+Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
    llama_free(reinterpret_cast<llama_context *>(context));
 }

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
+Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
    llama_backend_free();
 }

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
+Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
    llama_log_set(log_callback, NULL);
 }

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_android_llama_cpp_LLamaAndroid_bench_1model(
+Java_com_example_llama_Llm_bench_1model(
        JNIEnv *env,
        jobject,
        jlong context_pointer,
@@ -271,13 +271,13 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }

 extern "C"
 JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {

    // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.

@@ -313,19 +313,19 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
+Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
    llama_backend_init();
 }

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
+Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
    return env->NewStringUTF(llama_print_system_info());
 }

 extern "C"
 JNIEXPORT jint JNICALL
-Java_android_llama_cpp_LLamaAndroid_completion_1init(
+Java_com_example_llama_Llm_completion_1init(
        JNIEnv *env,
        jobject,
        jlong context_pointer,
@@ -376,7 +376,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(

 extern "C"
 JNIEXPORT jstring JNICALL
-Java_android_llama_cpp_LLamaAndroid_completion_1loop(
+Java_com_example_llama_Llm_completion_1loop(
        JNIEnv * env,
        jobject,
        jlong context_pointer,
@@ -438,6 +438,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(

 extern "C"
 JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }
--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -1,4 +1,4 @@
-package android.llama.cpp
+package com.example.llama

 import android.util.Log
 import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
 import java.util.concurrent.Executors
 import kotlin.concurrent.thread

-class LLamaAndroid {
+class Llm {
    private val tag: String? = this::class.simpleName

    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class LLamaAndroid {
        }

        // Enforce only one instance of Llm.
-        private val _instance: LLamaAndroid = LLamaAndroid()
+        private val _instance: Llm = Llm()

-        fun instance(): LLamaAndroid = _instance
+        fun instance(): Llm = _instance
    }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,6 +1,5 @@
 package com.example.llama

-import android.llama.cpp.LLamaAndroid
 import android.util.Log
 import androidx.compose.runtime.getValue
 import androidx.compose.runtime.mutableStateOf
@@ -10,7 +9,7 @@ import androidx.lifecycle.viewModelScope
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.launch

-class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
+class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
    companion object {
        @JvmStatic
        private val NanosPerSecond = 1_000_000_000.0
@@ -29,7 +28,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan

        viewModelScope.launch {
            try {
-                llamaAndroid.unload()
+                llm.unload()
            } catch (exc: IllegalStateException) {
                messages += exc.message!!
            }
@@ -45,7 +44,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
        messages += ""

        viewModelScope.launch {
-            llamaAndroid.send(text)
+            llm.send(text)
                .catch {
                    Log.e(tag, "send() failed", it)
                    messages += it.message!!
@@ -58,7 +57,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
        viewModelScope.launch {
            try {
                val start = System.nanoTime()
-                val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
+                val warmupResult = llm.bench(pp, tg, pl, nr)
                val end = System.nanoTime()

                messages += warmupResult
@@ -71,7 +70,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
                    return@launch
                }

-                messages += llamaAndroid.bench(512, 128, 1, 3)
+                messages += llm.bench(512, 128, 1, 3)
            } catch (exc: IllegalStateException) {
                Log.e(tag, "bench() failed", exc)
                messages += exc.message!!
@@ -82,7 +81,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
    fun load(pathToModel: String) {
        viewModelScope.launch {
            try {
-                llamaAndroid.load(pathToModel)
+                llm.load(pathToModel)
                messages += "Loaded $pathToModel"
            } catch (exc: IllegalStateException) {
                Log.e(tag, "load() failed", exc)
--- a/examples/llama.android/build.gradle.kts
+++ b/examples/llama.android/build.gradle.kts
@@ -2,5 +2,4 @@
 plugins {
    id("com.android.application") version "8.2.0" apply false
    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
-    id("com.android.library") version "8.2.0" apply false
 }
--- a/examples/llama.android/llama/.gitignore
+++ b/examples/llama.android/llama/.gitignore
@@ -1 +0,0 @@
-/build
--- a/examples/llama.android/llama/CMakeLists.txt
+++ b/examples/llama.android/llama/CMakeLists.txt
@@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -1,68 +0,0 @@
-plugins {
-    id("com.android.library")
-    id("org.jetbrains.kotlin.android")
-}
-
-android {
-    namespace = "android.llama.cpp"
-    compileSdk = 34
-
-    defaultConfig {
-        minSdk = 33
-
-        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
-        consumerProguardFiles("consumer-rules.pro")
-        ndk {
-            // Add NDK properties if wanted, e.g.
-            // abiFilters += listOf("arm64-v8a")
-        }
-        externalNativeBuild {
-            cmake {
-                arguments += "-DCMAKE_BUILD_TYPE=Release"
-                cppFlags += listOf()
-                arguments += listOf()
-
-                cppFlags("")
-            }
-        }
-    }
-
-    buildTypes {
-        release {
-            isMinifyEnabled = false
-            proguardFiles(
-                getDefaultProguardFile("proguard-android-optimize.txt"),
-                "proguard-rules.pro"
-            )
-        }
-    }
-    externalNativeBuild {
-        cmake {
-            path("src/main/cpp/CMakeLists.txt")
-            version = "3.22.1"
-        }
-    }
-    compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_1_8
-        targetCompatibility = JavaVersion.VERSION_1_8
-    }
-    kotlinOptions {
-        jvmTarget = "1.8"
-    }
-
-    packaging {
-        resources {
-            excludes += "/META-INF/{AL2.0,LGPL2.1}"
-        }
-    }
-}
-
-dependencies {
-
-    implementation("androidx.core:core-ktx:1.12.0")
-    implementation("androidx.appcompat:appcompat:1.6.1")
-    implementation("com.google.android.material:material:1.11.0")
-    testImplementation("junit:junit:4.13.2")
-    androidTestImplementation("androidx.test.ext:junit:1.1.5")
-    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
-}
--- a/examples/llama.android/llama/consumer-rules.pro
+++ b/examples/llama.android/llama/consumer-rules.pro
--- a/examples/llama.android/llama/proguard-rules.pro
+++ b/examples/llama.android/llama/proguard-rules.pro
@@ -1,21 +0,0 @@
-# Add project specific ProGuard rules here.
-# You can control the set of applied configuration files using the
-# proguardFiles setting in build.gradle.
-#
-# For more details, see
-#   http://developer.android.com/guide/developing/tools/proguard.html
-
-# If your project uses WebView with JS, uncomment the following
-# and specify the fully qualified class name to the JavaScript interface
-# class:
-#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
-#   public *;
-#}
-
-# Uncomment this to preserve the line number information for
-# debugging stack traces.
-#-keepattributes SourceFile,LineNumberTable
-
-# If you keep the line number information, uncomment this to
-# hide the original source file name.
-#-renamesourcefileattribute SourceFile
--- a/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
+++ b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
@@ -1,24 +0,0 @@
-package android.llama.cpp
-
-import androidx.test.platform.app.InstrumentationRegistry
-import androidx.test.ext.junit.runners.AndroidJUnit4
-
-import org.junit.Test
-import org.junit.runner.RunWith
-
-import org.junit.Assert.*
-
-/**
- * Instrumented test, which will execute on an Android device.
- *
- * See [testing documentation](http://d.android.com/tools/testing).
- */
-@RunWith(AndroidJUnit4::class)
-class ExampleInstrumentedTest {
-    @Test
-    fun useAppContext() {
-        // Context of the app under test.
-        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
-        assertEquals("android.llama.cpp.test", appContext.packageName)
-    }
-}
--- a/examples/llama.android/llama/src/main/AndroidManifest.xml
+++ b/examples/llama.android/llama/src/main/AndroidManifest.xml
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android">
-
-</manifest>
--- a/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
+++ b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
@@ -1,17 +0,0 @@
-package android.llama.cpp
-
-import org.junit.Test
-
-import org.junit.Assert.*
-
-/**
- * Example local unit test, which will execute on the development machine (host).
- *
- * See [testing documentation](http://d.android.com/tools/testing).
- */
-class ExampleUnitTest {
-    @Test
-    fun addition_isCorrect() {
-        assertEquals(4, 2 + 2)
-    }
-}
--- a/examples/llama.android/settings.gradle.kts
+++ b/examples/llama.android/settings.gradle.kts
@@ -15,4 +15,3 @@ dependencyResolutionManagement {

 rootProject.name = "LlamaAndroid"
 include(":app")
-include(":llama")
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	0fc560fe96	ci : enable git lfs for build.yml	2024-05-08 10:53:02 +03:00
Georgi Gerganov	db5c2ad30e	Revert "tmp : dummy change to trigger ci" This reverts commit `97e40df5d6`.	2024-05-08 10:42:25 +03:00
Georgi Gerganov	97e40df5d6	tmp : dummy change to trigger ci	2024-05-08 10:42:11 +03:00
Georgi Gerganov	837f426f19	ci : try lfs true	2024-05-08 10:30:25 +03:00
Georgi Gerganov	9d13776f34	ci : deps before checkout	2024-05-08 10:24:53 +03:00
Georgi Gerganov	2c7ff2c7ae	ci : add git-lfs ggml-ci	2024-05-08 10:18:47 +03:00
Georgi Gerganov	0dc0e9aa42	models : convert vocab files to LFS ggml-ci	2024-05-08 09:54:38 +03:00
				`@@ -0,0 +1 @@`
				`models/ggml-vocab-*.gguf filter=lfs diff=lfs merge=lfs -text`