fix some spaces added by IDE in math op

ggml-ci
rename n_ctx to kv_size
2026-04-23 16:37:33 +03:00 · 2024-02-18 22:40:35 +02:00 · 2024-02-18 22:40:35 +02:00 · 2024-02-18 22:40:34 +02:00 · 2024-02-18 22:40:32 +02:00
135 changed files with 51788 additions and 59577 deletions
--- a/.devops/nix/docker.nix
+++ b/.devops/nix/docker.nix
@@ -1,37 +0,0 @@
-{
-  lib,
-  dockerTools,
-  buildEnv,
-  llama-cpp,
-  interactive ? true,
-  coreutils,
-}:
-
-# A tar that can be fed into `docker load`:
-#
-# $ nix build .#llamaPackages.docker
-# $ docker load < result
-
-# For details and variations cf.
-# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
-# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
-# - https://nixery.dev/
-
-# Approximate (compressed) sizes, at the time of writing, are:
-#
-# .#llamaPackages.docker: 125M;
-# .#llamaPackagesCuda.docker: 537M;
-# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
-
-dockerTools.buildLayeredImage {
-  name = llama-cpp.pname;
-  tag = "latest";
-
-  contents =
-    [ llama-cpp ]
-    ++ lib.optionals interactive [
-      coreutils
-      dockerTools.binSh
-      dockerTools.caCertificates
-    ];
-}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -1,6 +1,5 @@
 {
  lib,
-  glibc,
  config,
  stdenv,
  mkShell,
@@ -31,11 +30,6 @@
  useRocm ? config.rocmSupport,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic
 }@inputs:

 let
@@ -47,7 +41,10 @@ let
    versionOlder
    ;

+  # It's necessary to consistently use backendStdenv when building with CUDA support,
+  # otherwise we get libstdc++ errors downstream.
  stdenv = throw "Use effectiveStdenv instead";
+  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;

  suffices =
    lib.optionals useBlas [ "BLAS" ]
@@ -170,9 +167,6 @@ effectiveStdenv.mkDerivation (
        # TODO: Replace with autoAddDriverRunpath
        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
        cudaPackages.autoAddOpenGLRunpathHook
-      ]
-      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
-        glibc.static
      ];

    buildInputs =
@@ -187,7 +181,7 @@ effectiveStdenv.mkDerivation (
      [
        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+        (cmakeBool "BUILD_SHARED_LIBS" true)
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
@@ -196,7 +190,6 @@ effectiveStdenv.mkDerivation (
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
        (cmakeBool "LLAMA_VULKAN" useVulkan)
-        (cmakeBool "LLAMA_STATIC" enableStatic)
      ]
      ++ optionals useCuda [
        (
@@ -262,11 +255,11 @@ effectiveStdenv.mkDerivation (
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+      badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
+      broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);

      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
      homepage = "https://github.com/ggerganov/llama.cpp/";
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -12,8 +12,5 @@ lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    llama-cpp = self.callPackage ./package.nix { };
-    docker = self.callPackage ./docker.nix { };
-    docker-min = self.callPackage ./docker.nix { interactive = false; };
-    sif = self.callPackage ./sif.nix { };
  }
 )
--- a/.devops/nix/sif.nix
+++ b/.devops/nix/sif.nix
@@ -1,27 +0,0 @@
-{
-  lib,
-  singularity-tools,
-  llama-cpp,
-  bashInteractive,
-  interactive ? false,
-}:
-
-let
-  optionalInt = cond: x: if cond then x else 0;
-in
-singularity-tools.buildImage rec {
-  inherit (llama-cpp) name;
-  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
-
-  # These are excessive (but safe) for most variants. Building singularity
-  # images requires superuser privileges, so we build them inside a VM in a
-  # writable image of pre-determined size.
-  #
-  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
-  #
-  # Expected image sizes:
-  # - cpu/blas: 150M,
-  # - cuda, all gencodes: 560M,
-  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
-  memSize = diskSize;
-}
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -7,5 +7,3 @@ assignees: ''
 ---

 Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-
-If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -145,28 +145,6 @@ jobs:
          cd build
          ctest -L main --verbose

-  ubuntu-22-cmake-vulkan:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libvulkan-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_VULKAN=ON ..
-          cmake --build . --config Release -j $(nproc)
-
  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04

@@ -691,7 +669,8 @@ jobs:
        run: |
          cd examples/llama.android

-          ./gradlew build --no-daemon
+          # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
+          ./gradlew build --no-daemon -Pskip-armeabi-v7a

 #  freeBSD-latest:
 #    runs-on: macos-12
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -19,6 +19,7 @@ on:

 jobs:
  nix-build-aarch64:
+    if: ${{ vars.CACHIX_NAME != '' }}
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
@@ -36,8 +37,8 @@ jobs:
        extra-conf: |
          extra-platforms = aarch64-linux
          extra-system-features = nixos-test kvm
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -45,7 +46,7 @@ jobs:
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
+        name: ${{ vars.CACHIX_NAME }}
    - name: Show all output paths
      run: >
          nix run github:nix-community/nix-eval-jobs
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -23,8 +23,8 @@ jobs:
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,6 +37,7 @@ jobs:
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
+    if: ${{ vars.CACHIX_NAME != '' }}
    strategy:
      fail-fast: false
      matrix:
@@ -50,8 +51,8 @@ jobs:
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
-          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -59,7 +60,7 @@ jobs:
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: llama-cpp
+        name: ${{ vars.CACHIX_NAME }}
    - name: Build
      run: >
          nix run github:Mic92/nix-fast-build
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -3,14 +3,12 @@ name: Python check requirements.txt
 on:
  push:
    paths:
-      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
-      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
@@ -28,4 +26,4 @@ jobs:
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh
+        run:  bash scripts/check-requirements.sh nocleanup
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -1,91 +0,0 @@
-# Server build and tests
-name: Server
-
-on:
-  workflow_dispatch: # allows manual triggering
-    inputs:
-      slow_tests:
-        description: 'Run slow tests'
-        required: true
-        type: boolean
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
-  schedule:
-    -  cron: '0 0 * * *'
-
-jobs:
-  server:
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
-        include:
-          - build_type: Release
-            sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
-            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
-
-    container:
-      image: ubuntu:latest
-      ports:
-        - 8888
-      options: --cpus 4
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          apt-get update
-          apt-get -y install \
-            build-essential \
-            git \
-            cmake \
-            python3-pip \
-            wget \
-            psmisc
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. \
-              -DLLAMA_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
-
-      - name: Tests
-        id: server_integration_tests
-        run: |
-          cd examples/server/tests
-          PORT=8888 ./tests.sh
-
-      - name: Slow tests
-        id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
-        run: |
-          cd examples/server/tests
-          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,7 +110,6 @@ option(LLAMA_VULKAN_RUN_TESTS                "llama: run Vulkan tests"
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
-option(LLAMA_METAL_EMBED_LIBRARY             "llama: embed Metal library"                       OFF)
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@@ -146,6 +145,14 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)

+if (LLAMA_FATAL_WARNINGS)
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+        add_compile_options(-Werror)
+    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+        add_compile_options(/WX)
+    endif()
+endif()
+
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
@@ -202,29 +209,6 @@ if (LLAMA_METAL)
    # copy ggml-metal.metal to bin directory
    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

-    if (LLAMA_METAL_EMBED_LIBRARY)
-        enable_language(ASM)
-        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
-
-        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
-        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
-
-        add_custom_command(
-            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-            DEPENDS ${METALLIB_SOURCE}
-            COMMENT "Generate assembly for embedded Metal library"
-        )
-
-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-    endif()
-
    if (LLAMA_METAL_SHADER_DEBUG)
        # custom command to do the following:
        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
@@ -757,30 +741,28 @@ function(get_flags CCID CCVER)
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
        endif()
+    elseif (CCID MATCHES "Intel")
+        if (NOT LLAMA_SYCL)
+            # enable max optimization level when using Intel compiler
+            set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            add_link_options(-fuse-ld=lld -static-intel)
+        endif()
    endif()

    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
 endfunction()

-if (LLAMA_FATAL_WARNINGS)
-    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-        list(APPEND C_FLAGS   -Werror)
-        list(APPEND CXX_FLAGS -Werror)
-    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-        add_compile_options(/WX)
-    endif()
-endif()
-
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
-        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                                  -Werror=implicit-int -Werror=implicit-function-declaration)
-        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                          -Werror=implicit-int -Werror=implicit-function-declaration)
+        set(CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)

-        list(APPEND C_FLAGS   ${WARNING_FLAGS})
-        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
+        set(C_FLAGS   ${WARNING_FLAGS} ${C_FLAGS})
+        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})

        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

@@ -796,10 +778,9 @@ endif()
 set(CUDA_CXX_FLAGS "")

 if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS -use_fast_math)
-
-    if (LLAMA_FATAL_WARNINGS)
-        list(APPEND CUDA_FLAGS -Werror all-warnings)
+    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
+    if (NOT MSVC)
+        list(APPEND CUDA_FLAGS -Wno-pedantic)
    endif()

    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
@@ -833,11 +814,7 @@ if (LLAMA_CUBLAS)
        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")

        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
-    endif()
-
-    if (NOT MSVC)
-        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
+        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
    endif()
 endif()

@@ -936,16 +913,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-                # Android armeabi-v7a
-                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
-            else()
-                # Raspberry Pi 2
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
-            endif()
+            # Raspberry Pi 2
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
--- a/86
+++ b/86
@@ -97,10 +97,9 @@ endif
 #

 # keep standard at C11 and C++11
-MK_CPPFLAGS  = -I. -Icommon
-MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS  = -std=c++11 -fPIC
-MK_NVCCFLAGS = -std=c++11
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS   = -std=c11   -fPIC
+MK_CXXFLAGS = -std=c++11 -fPIC

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -173,7 +172,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS  += -g

 	ifeq ($(UNAME_S),Linux)
-		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
+		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
 	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -217,10 +216,34 @@ MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

 ifeq ($(LLAMA_FATAL_WARNINGS),1)
-	MK_CFLAGS   += -Werror
+	MK_CFLAGS += -Werror
 	MK_CXXFLAGS += -Werror
 endif

+ifeq ($(CC_IS_CLANG), 1)
+	# clang options
+	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
+	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+else
+	# gcc options
+	MK_CFLAGS        += -Wdouble-promotion
+	MK_HOST_CXXFLAGS += -Wno-array-bounds
+
+	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
+		MK_HOST_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
+		MK_HOST_CXXFLAGS += -Wextra-semi
+	endif
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -381,18 +404,10 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-	ifneq ('', '$(wildcard /opt/cuda)')
-		CUDA_PATH ?= /opt/cuda
-	else
-		CUDA_PATH ?= /usr/local/cuda
-	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
-ifdef LLAMA_FATAL_WARNINGS
-	MK_NVCCFLAGS += -Werror all-warnings
-endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
@@ -451,9 +466,9 @@ ifdef LLAMA_CUDA_CCBIN
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS

@@ -538,29 +553,11 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
-ifdef LLAMA_METAL_EMBED_LIBRARY
-	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-	OBJS        += ggml-metal-embed.o
-endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
-
-ifdef LLAMA_METAL_EMBED_LIBRARY
-ggml-metal-embed.o: ggml-metal.metal
-	@echo "Embedding Metal library"
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f ${TEMP_ASSEMBLY}
-endif
 endif # LLAMA_METAL

 ifdef LLAMA_MPI
@@ -572,10 +569,9 @@ GF_CC := $(CC)
 include scripts/get-flags.mk

 # combine build flags with cmdline overrides
-override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
-override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)

@@ -583,7 +579,7 @@ override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
+CUDA_CXXFLAGS := $(GF_CXXFLAGS)
 endif

 #
@@ -602,7 +598,7 @@ $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
-CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
@@ -724,7 +720,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -895,7 +891,3 @@ tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o te
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -1,7 +1,6 @@
 # llama.cpp for SYCL

 - [Background](#background)
- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -26,21 +25,6 @@ The llama.cpp for SYCL is used to support Intel GPUs.

 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).

-## News
-
- 2024.3
-  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
-  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
-  - Support detecting all GPUs with level-zero and same top **Max compute units**.
-  - Support OPs
-    - hardsigmoid
-    - hardswish
-    - pool2d
-
- 2024.1
-  - Create SYCL backend for Intel GPU.
-  - Support Windows build
-
 ## OS

 |OS|Status|Verified|
@@ -288,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact

 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

-Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
+Recommend to install to default folder: **/opt/intel/oneapi**.

 Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.

@@ -465,7 +449,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

 ## Known Issue

@@ -475,10 +458,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

  Solution: add **--no-mmap** or **--mmap 0**.

- Split-mode: [row] is not supported
-
-  It's on developing.
-
 ## Q&A

 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
--- a/README.md
+++ b/README.md
@@ -8,17 +8,15 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-### Recent API changes
-
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
-
 ### Hot topics

- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
+- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
+- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
+  - [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
+- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
+- Collecting Apple Silicon performance stats:
+  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
+  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

 ----
@@ -63,7 +61,7 @@ variety of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
+- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
 - Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
@@ -109,20 +107,16 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)

 **Multimodal models:**

- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
+- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)

-**HTTP server**
-
-[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.

 **Bindings:**

@@ -151,7 +145,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
@@ -163,9 +156,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)

 ---

@@ -196,7 +186,7 @@ llm_load_print_meta: vocab type     = SPM
 llm_load_print_meta: n_vocab        = 32000
 llm_load_print_meta: n_merges       = 0
 llm_load_print_meta: n_ctx_train    = 4096
-llm_load_print_meta: n_ctx          = 512
+llm_load_print_meta: kv_size        = 512
 llm_load_print_meta: n_embd         = 5120
 llm_load_print_meta: n_head         = 40
 llm_load_print_meta: n_head_kv      = 40
@@ -224,7 +214,7 @@ llama_new_context_with_model: compute buffer total size =   75.41 MB

 system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
 sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
-generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
+generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0


 Building a website can be done in 10 simple steps:
@@ -791,7 +781,7 @@ And after 4.45 hours, you will have the final perplexity.
 ### Interactive mode

 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.

 Here is an example of a few-shot interaction, invoked with the command

@@ -855,7 +845,7 @@ Sample run:
 ```
 == Running in interactive mode. ==
 - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMA.
+ - Press Return to return control to LLaMa.
 - If you want to submit another line, end your input in '\'.

 Below is an instruction that describes a task. Write a response that appropriately completes the request.
--- a/awq-py/README.md
+++ b/awq-py/README.md
@@ -0,0 +1,116 @@
+# AWQ: Activation-aware Weight Quantization for LLMs - version adapted for llama.cpp
+[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
+
+**Supported models:**
+
+- [X] LLaMA
+- [x] LLaMA 2
+- [X] MPT
+- [X] Mistral AI v0.1
+- [ ] Bloom
+- [ ] Mixtral MoE
+
+**TODO:**
+- [x] Update version work with both MPT and MPT-AWQ model
+- [ ] Add OPT model
+- [ ] Add Bloom model
+- [ ] Add Mixtral MoE
+- [ ] Support w3, w2
+
+
+## Contents
+
+- [Install](#install)
+- [Convert](#convert)
+- [Quantize](#quantize)
+- [Test](#test)
+- [Benchmark](#benchmark)
+- [Results](#results)
+
+## Install
+Install requirements
+```bash
+pip install -r requirements.txt
+```
+Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
+```bash
+git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
+```
+
+## Convert
+Example for llama model
+```bash
+# For llama7b and llama2 models
+python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
+# For mistral and mpt models
+python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
+```
+
+## Quantize
+```bash
+# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
+./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
+```
+
+## Test
+```bash
+# For all models.
+./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
+```
+
+## Benchmark
+The perplexity measurements in the tables below (see [Results](#results)) are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
+```bash
+# For llama and llama2, and mistral models.
+./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
+```
+
+## Results
+Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for a fair comparison.
+We use three llama.cpp quantization types with our version: q4_0, q4_1, and q2_k.
+
+### Llama 7B (Build with OpenBLAS)
+
+| Model      | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
+|-----------:|--------------|-------:|-------:|-------:|-------:|
+|Llama 7B    | perplexity   | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
+|Llama 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
+|Llama 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
+|AWQ-LLama 7B| perplexity   | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
+|AWQ-LLama 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
+|AWQ-LLama 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
+
+
+### Llama2 7B (Build with CuBLAS)
+
+| Model       | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
+|------------:|--------------|-------:|-------:|-------:|-------:|
+|Llama2 7B    | perplexity   | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
+|Llama2 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
+|Llama2 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
+|AWQ-LLama2 7B| perplexity   | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
+|AWQ-LLama2 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
+|AWQ-LLama2 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
+
+
+### Mistral 7B v0.1 (Build with CuBLAS)
+
+| Model        | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
+|-------------:|--------------|-------:|-------:|-------:|-------:|
+|Mistral 7B    | perplexity   | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
+|Mistral 7B    | file size     |  14.5G |   4.1G |   4.5G |   3.1G |
+|Mistral 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
+|AWQ-Mistral 7B| perplexity   | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
+|AWQ-Mistral 7B| file size     |  14.5G |   4.1G |   4.5G |   3.1G |
+|AWQ-Mistral 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
+
+### MPT 7B (Build with OpenBLAS)
+
+| Model    | Measure      | F16    | Q4_0   | Q4_1   | Q2_K    |
+|---------:|--------------|-------:|-------:|-------:|--------:|
+|MPT 7B    | perplexity   | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
+|MPT 7B    | file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
+|MPT 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
+|AWQ-MPT 7B| perplexity   | 8.4944 | 8.7053 |  8.6750 | 10.2873|
+|AWQ-MPT 7B| file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
+|AWQ-MPT 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
--- a/awq-py/awq/apply_awq.py
+++ b/awq-py/awq/apply_awq.py
@@ -0,0 +1,254 @@
+"""
+Implements the AWQ for llama.cpp use cases.
+Original paper: https://arxiv.org/abs/2306.00978
+
+This code is based on versions of the AWQ implementation found in the following repositories:
+* https://github.com/mit-han-lab/llm-awq
+* https://github.com/casper-hansen/AutoAWQ
+"""
+
+import os
+import torch
+import torch.nn as nn
+
+from transformers import AutoModelForCausalLM, AutoConfig
+from transformers.models.bloom.modeling_bloom import BloomGelu
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
+from transformers.activations import GELUActivation
+
+
+class ScaledActivation(nn.Module):
+    """
+    Wrap an activation module and divide its output by a set of scale factors.
+
+    Args:
+        module (nn.Module): The activation function being wrapped.
+        scales (torch.Tensor): Scale factors, stored as an ``nn.Parameter`` and
+            viewed as ``(1, 1, -1)`` when applied, i.e. broadcast along the
+            last dimension of the activation output.
+
+    Returns:
+        torch.Tensor: The activation output divided by the scale factors.
+    """
+
+    def __init__(self, module, scales):
+        super().__init__()
+        self.act = module
+        self.scales = nn.Parameter(scales.data)
+
+    def forward(self, x):
+        """Run the wrapped activation on ``x`` and divide by the scales."""
+        activated = self.act(x)
+        # Move the scales to the input's device on every call so the wrapper
+        # works even if it was built on a different device than ``x``.
+        divisor = self.scales.view(1, 1, -1).to(x.device)
+        return activated / divisor
+
+
+def set_op_by_name(layer, name, new_module):
+    """
+    Replace the submodule found at a dotted path with ``new_module``.
+
+    Args:
+        layer (nn.Module): The root module in which the replacement happens.
+        name (str): Dot-separated path to the submodule. Purely numeric path
+            components are used as integer indices (``parent[int(part)]``);
+            all others are looked up with ``getattr``.
+        new_module (nn.Module): The module to install at that path.
+    """
+    # Everything before the last dot locates the parent; the final component
+    # is the attribute to overwrite. A name without dots has no parents, so
+    # the attribute is set directly on ``layer``.
+    *parent_parts, leaf = name.split(".")
+    parent = layer
+    for part in parent_parts:
+        parent = parent[int(part)] if part.isdigit() else getattr(parent, part)
+    setattr(parent, leaf, new_module)
+
+
+def get_op_by_name(module, op_name):
+    """
+    Look up a submodule of ``module`` by its qualified name.
+
+    Args:
+        module (nn.Module): The module whose ``named_modules()`` are searched.
+        op_name (str): The name to match exactly against those entries.
+
+    Returns:
+        nn.Module: The first submodule whose name equals ``op_name``.
+
+    Raises:
+        ValueError: If no submodule has that name.
+    """
+    match = next(
+        (sub for sub_name, sub in module.named_modules() if sub_name == op_name),
+        None,
+    )
+    if match is None:
+        raise ValueError(f"Cannot find op {op_name} in module {module}")
+    return match
+
+
+@torch.no_grad()
+def scale_ln_fcs(ln, fcs, scales):
+    """
+    Divide a normalization layer's parameters by ``scales`` and multiply the
+    weights of the following linear layer(s) by the same ``scales``, in place.
+
+    Args:
+        ln (nn.Module): Normalization module with a ``weight`` and, optionally,
+            a ``bias`` parameter.
+        fcs (nn.Linear or List[nn.Linear]): Layer(s) fed by ``ln``. Anything
+            that is not a ``list`` is wrapped into a one-element list.
+        scales (torch.Tensor): Scale factors; viewed as ``(1, -1)`` against each
+            ``fc.weight`` (presumably 1D of size ``(num_features,)``).
+
+    Raises:
+        AssertionError: If any parameter of ``ln`` or of the linear layers
+            contains NaN after scaling.
+    """
+    fc_layers = fcs if isinstance(fcs, list) else [fcs]
+    factors = scales.to(ln.weight.device)
+
+    ln.weight.div_(factors)
+    ln_bias = getattr(ln, "bias", None)
+    if ln_bias is not None:
+        ln_bias.div_(factors)
+
+    column_factors = factors.view(1, -1)
+    for fc in fc_layers:
+        fc.weight.mul_(column_factors)
+
+    # Sanity check: scaling must not have produced NaNs anywhere.
+    for mod in [ln, *fc_layers]:
+        for param in mod.parameters():
+            assert torch.isnan(param).sum() == 0
+
+
+@torch.no_grad()
+def scale_fc_fc(fc1, fc2, scales):
+    """
+    Divide the output side of ``fc1`` by ``scales`` and multiply the weights
+    of ``fc2`` by the same ``scales``, in place.
+
+    Args:
+        fc1 (nn.Linear): The first linear layer. Only its last
+            ``scales.size(0)`` weight rows are divided.
+        fc2 (nn.Linear): The second linear layer; its weight is multiplied by
+            ``scales`` viewed as ``(1, -1)``.
+        scales (torch.Tensor): Scale factors (presumably 1D of size
+            ``(num_features,)``).
+
+    Raises:
+        AssertionError: If either layer is not an ``nn.Linear``, or if any
+            parameter contains NaN after scaling.
+    """
+    assert isinstance(fc1, nn.Linear)
+    assert isinstance(fc2, nn.Linear)
+
+    factors = scales.to(fc1.weight.device)
+    n_scaled = factors.size(0)
+
+    fc1.weight[-n_scaled:].div_(factors.view(-1, 1))
+    # NOTE(review): the whole bias is divided, whereas only the last
+    # ``n_scaled`` weight rows are - confirm this is intended when fc1 has
+    # more output features than there are scales.
+    if fc1.bias is not None:
+        fc1.bias.div_(factors.view(-1))
+
+    fc2.weight.mul_(factors.view(1, -1))
+
+    # Sanity check: scaling must not have produced NaNs anywhere.
+    for fc in (fc1, fc2):
+        for param in fc.parameters():
+            assert torch.isnan(param).sum() == 0
+
+
+@torch.no_grad()
+def scale_gelu_fc(gelu, fc, scales):
+    """
+    Multiply the weight of the linear layer that follows a GELU activation by
+    ``scales``, in place.
+
+    The ``gelu`` module itself is only type-checked here; it is not modified
+    by this function.
+
+    Args:
+        gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation
+            module preceding ``fc``.
+        fc (nn.Linear): The linear layer whose weight is scaled.
+        scales (torch.Tensor): Scale factors; viewed as ``(1, -1)`` against
+            ``fc.weight`` (presumably 1D of size ``(num_features,)``).
+
+    Raises:
+        AssertionError: If ``gelu`` is not an ``nn.GELU``, ``BloomGelu`` or
+            ``GELUActivation``, if ``fc`` is not an ``nn.Linear``, or if any
+            parameter of ``fc`` contains NaN after scaling. (The checks are
+            plain ``assert`` statements, so they do not raise ``TypeError``
+            and are skipped under ``python -O``.)
+    """
+    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
+    assert isinstance(fc, nn.Linear)
+
+    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
+
+    # Sanity check: scaling must not have produced NaNs.
+    for p in fc.parameters():
+        assert torch.isnan(p).sum() == 0
+
+
+def apply_scale(module, scales_list, input_feat_dict=None):
+    """
+    Apply pre-computed AWQ scales to pairs of (previous op, following layers)
+    inside ``module``, choosing the strategy from the type of the previous op.
+
+    Args:
+        module (nn.Module): The module containing the layers to be scaled.
+        scales_list (List[Tuple[str, List[str], torch.Tensor]]): One tuple per
+            scaling step, containing:
+            * prev_op_name (str): Name of the preceding op inside ``module``.
+            * layer_names (List[str]): Names of the layers fed by that op.
+            * scales (torch.Tensor): The scale factors for this step.
+        input_feat_dict (Optional[Dict[str, torch.Tensor]]): If given, the
+            entry for each layer name is divided in place by ``scales`` so it
+            can be used for clipping afterwards.
+
+    Raises:
+        NotImplementedError: If the previous op is not a linear layer, a
+            LayerNorm/RMSNorm, or a supported GELU activation.
+    """
+    # Use the GPU when there is one, but do not require it: an unconditional
+    # ``.cuda()`` fails on CPU-only hosts.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    for prev_op_name, layer_names, scales in scales_list:
+        prev_op = get_op_by_name(module, prev_op_name)
+        layers = [get_op_by_name(module, name) for name in layer_names]
+
+        prev_op.to(device)
+        for layer in layers:
+            layer.to(device)
+        # ``scales`` is deliberately left where it is: Tensor.cuda()/.cpu()
+        # return a copy instead of moving in place, and every scale_* helper
+        # moves its own copy to the device of the weights it touches.
+
+        if isinstance(prev_op, nn.Linear):
+            assert len(layers) == 1
+            scale_fc_fc(prev_op, layers[0], scales)
+        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
+            scale_ln_fcs(prev_op, layers, scales)
+        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
+            # Wrap the activation so its output is divided by the scales,
+            # compensating for the multiplied weights of the next layer.
+            new_module = ScaledActivation(prev_op, scales)
+            set_op_by_name(module, prev_op_name, new_module)
+            scale_gelu_fc(prev_op, layers[0], scales)
+        else:
+            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
+
+        # apply the scaling to input feat if given; prepare it for clipping
+        if input_feat_dict is not None:
+            for layer_name in layer_names:
+                inp = input_feat_dict[layer_name]
+                inp.div_(scales.view(1, -1).to(inp.device))
+
+        # Move the modules back so only one step's layers occupy the device.
+        prev_op.cpu()
+        for layer in layers:
+            layer.cpu()
+
+
+@torch.no_grad()
+def apply_clip(module, clip_list):
+    """
+    Clamp the weights of selected layers to pre-computed symmetric bounds.
+
+    Args:
+        module (nn.Module): The module containing the layers to be clipped.
+        clip_list (List[Tuple[str, torch.Tensor]]): One tuple per layer:
+            * name (str): Name of the layer inside ``module``.
+            * max_val (torch.Tensor): Clipping bound. The weight is reshaped to
+              ``(*max_val.shape[:2], -1)`` and clamped to
+              ``[-max_val, max_val]``, so ``max_val`` must broadcast against
+              that shape (presumably ``(out_features, n_groups, 1)`` -
+              confirm against the producer of the AWQ results).
+    """
+    # Use the GPU when there is one, but do not require it: an unconditional
+    # ``.cuda()`` fails on CPU-only hosts.
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    for name, max_val in clip_list:
+        layer = get_op_by_name(module, name)
+        layer.to(device)
+        max_val = max_val.to(layer.weight.device)
+        org_shape = layer.weight.shape
+        # Group the weight like max_val, clamp, then restore the shape.
+        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
+        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
+        layer.weight.data = layer.weight.data.reshape(org_shape)
+        layer.cpu()
+
+
+def add_scale_weights(model_path, scale_path, tmp_path):
+    """
+    Adds pre-computed Activation-aware Weight Quantization (AWQ) results to a
+    model, including scaling factors and clipping bounds, and saves the result.
+
+    Args:
+        model_path (str): Path to the pre-trained model to be equipped with AWQ.
+        scale_path (str): Path to the AWQ results (.pt file) holding the
+            ``"scale"`` and ``"clip"`` entries.
+        tmp_path (str): Directory where the equipped model is saved; the
+            model's ``tokenizer*`` files are copied there as well.
+    """
+    import glob
+    import shutil
+
+    # NOTE(security): trust_remote_code=True lets the model repository execute
+    # its own Python code; only use this with models you trust.
+    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, config=config, trust_remote_code=True
+    )
+    model.eval()
+    # NOTE(security): torch.load unpickles the file, which can execute
+    # arbitrary code; only load AWQ results from a trusted source.
+    awq_results = torch.load(str(scale_path), map_location="cpu")
+    apply_scale(model, awq_results["scale"])
+    apply_clip(model, awq_results["clip"])
+    model.save_pretrained(str(tmp_path))
+
+    # Copy the tokenizer files next to the saved model. This is done in
+    # Python rather than through a shell `cp`, so paths containing spaces or
+    # shell metacharacters are handled safely and it also works where `cp`
+    # is not available.
+    pattern = os.path.join(glob.escape(str(model_path)), "tokenizer*")
+    for src in glob.glob(pattern):
+        if os.path.isfile(src):
+            shutil.copy(src, os.path.join(str(tmp_path), os.path.basename(src)))
--- a/awq-py/requirements.txt
+++ b/awq-py/requirements.txt
@@ -0,0 +1,2 @@
+torch>=2.1.1
+transformers>=4.32.0
--- a/build.zig
+++ b/build.zig
@@ -123,7 +123,6 @@ pub fn build(b: *std.build.Builder) !void {
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
-    const llava = make.obj("llava", "examples/llava/llava.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@@ -132,7 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -272,19 +272,19 @@ function gg_run_open_llama_3b_v2 {
    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

@@ -343,17 +343,17 @@ function gg_run_open_llama_3b_v2 {
    python3 ../convert-lora-to-ggml.py ${path_lora}

    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

    # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

    # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

    set +e
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -19,12 +19,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
        endif()
    endif()

-    if(EXISTS "${GIT_DIR}/index")
-        set(GIT_INDEX "${GIT_DIR}/index")
-    else()
-        message(WARNING "Git index not found in git repository.")
-        set(GIT_INDEX "")
-    endif()
+    set(GIT_INDEX "${GIT_DIR}/index")
 else()
    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
    set(GIT_INDEX "")
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            }
            sparams.top_k = std::stoi(argv[i]);
        } else if (arg == "-c" || arg == "--ctx-size") {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.kv_size = std::stoi(argv[i]);
+            fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
+        } else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.n_ctx = std::stoi(argv[i]);
+            params.kv_size = std::stoi(argv[i]);
        } else if (arg == "--grp-attn-n" || arg == "-gan") {
            if (++i >= argc) {
                invalid_param = true;
@@ -295,9 +303,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
            else { invalid_param = true; break; }
        } else if (arg == "--rope-scale") {
            if (++i >= argc) {
@@ -335,22 +343,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--pooling") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
-            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
-            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
-            else { invalid_param = true; break; }
-        } else if (arg == "--defrag-thold" || arg == "-dt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.defrag_thold = std::stof(argv[i]);
        } else if (arg == "--samplers") {
            if (++i >= argc) {
                invalid_param = true;
@@ -513,6 +505,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_sequences = std::stoi(argv[i]);
+        } else if (arg == "--p-accept" || arg == "-pa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_accept = std::stof(argv[i]);
        } else if (arg == "--p-split" || arg == "-ps") {
            if (++i >= argc) {
                invalid_param = true;
@@ -640,15 +638,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            }
            std::string arg_next = argv[i];
            if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_MODE_NONE;
+                params.split_mode = LLAMA_SPLIT_NONE;
            } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+                params.split_mode = LLAMA_SPLIT_LAYER;
            } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
-                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+                params.split_mode = LLAMA_SPLIT_ROW;
            } else {
                invalid_param = true;
                break;
@@ -851,15 +845,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            sep++;
            if (strncmp(sep, "int:", 4) == 0) {
                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+                kvo.tag = LLAMA_KV_OVERRIDE_INT;
                kvo.int_value = std::atol(sep);
            } else if (strncmp(sep, "float:", 6) == 0) {
                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
                kvo.float_value = std::atof(sep);
            } else if (strncmp(sep, "bool:", 5) == 0) {
                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
                if (std::strcmp(sep, "true") == 0) {
                    kvo.bool_value = true;
                } else if (std::strcmp(sep, "false") == 0) {
@@ -976,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -bf FNAME, --binary-file FNAME\n");
    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
+    printf("  -kv N, --kv-size N    Specify the total size of the KV cache (default: %d)\n", params.kv_size);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
    printf("                        (default: %s)\n", sampler_type_names.c_str());
@@ -986,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
@@ -1018,14 +1012,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-attn-factor N  YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
    printf("  --yarn-beta-slow N    YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf("  --pooling {none,mean,cls}\n");
-    printf("                        pooling type for embeddings, use model default if unspecified\n");
-    printf("  -dt N, --defrag-thold N\n");
-    printf("                        KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
-    printf("  --all-logits          return logits for all tokens in the batch (default: disabled)\n");
+    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
@@ -1038,6 +1028,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf("  -pa N, --p-accept N   speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
@@ -1286,13 +1277,14 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

-    cparams.n_ctx             = params.n_ctx;
+    cparams.kv_size           = params.kv_size;
    cparams.n_batch           = params.n_batch;
    cparams.n_threads         = params.n_threads;
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
    cparams.logits_all        = params.logits_all;
-    cparams.embeddings        = params.embedding;
+    cparams.embedding         = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
    cparams.rope_freq_scale   = params.rope_freq_scale;
@@ -1301,8 +1293,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
-    cparams.pooling_type      = params.pooling_type;
-    cparams.defrag_thold      = params.defrag_thold;
    cparams.offload_kqv       = !params.no_kv_offload;

    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -1676,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
@@ -1734,6 +1724,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
--- a/common/common.h
+++ b/common/common.h
@@ -43,24 +43,25 @@ extern char const *LLAMA_BUILD_TARGET;
 int32_t get_num_physical_cores();

 struct gpt_params {
-    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
+    uint32_t seed                 = -1;    // RNG seed

    int32_t n_threads             = get_num_physical_cores();
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
    int32_t n_predict             = -1;    // new tokens to predict
-    int32_t n_ctx                 = 512;   // context size
+    int32_t kv_size               = 512;   // KV Cache size
    int32_t n_batch               = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
+    int32_t n_draft               = 8;     // number of tokens to draft during speculative decoding
    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            = 1;     // number of parallel sequences to decode
    int32_t n_sequences           = 1;     // number of sequences to decode
+    float   p_accept              = 0.5f;  // speculative decoding accept probability
    float   p_split               = 0.1f;  // speculative decoding split probability
    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    llama_split_mode split_mode   = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
@@ -74,12 +75,8 @@ struct gpt_params {
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
-    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
-
-    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
-
-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    int32_t rope_scaling_type     = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    ggml_numa_strategy numa       = GGML_NUMA_STRATEGY_DISABLED;

    // // sampling parameters
    struct llama_sampling_params sparams;
@@ -117,6 +114,7 @@ struct gpt_params {

    bool   kl_divergence   = false; // compute KL-divergence

+    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(
            //    }
            //}

-            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
        }
    }

@@ -295,77 +295,6 @@ static llama_token llama_sampling_sample_impl(
    return id;
 }

-static llama_token_data_array llama_sample_probability_distribution_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
-    const bool    penalize_nl     = params.penalize_nl;
-
-    auto & prev = ctx_sampling->prev;
-    auto & cur  = ctx_sampling->cur;
-
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
-
-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
-
-    // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
-    }
-
-    cur.clear();
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-
-    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    // apply grammar checks
-    if (ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
-    }
-
-    llama_sample_softmax(ctx_main, &cur_p);
-    return cur_p;
-}
-
 llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
@@ -375,14 +304,6 @@ llama_token llama_sampling_sample(
    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
 }

-llama_token_data_array llama_sampling_probability_distribution(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
-}
-
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -131,13 +131,6 @@ llama_token llama_sampling_sample(
        struct llama_context * ctx_cfg,
        int idx = 0);

-// returns the probability that token of given id will be sampled
-llama_token_data_array llama_sampling_probability_distribution(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0);
-
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -31,7 +31,7 @@ struct train_state  * init_train_state() {

    state->opt = new struct ggml_opt_context;
    state->opt->ctx = NULL;
-    state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
+    state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
    state->opt->loss_after = 0.0f;

@@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
    std::string opt_type;
    GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
    if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-        opt->params.type = GGML_OPT_TYPE_ADAM;
+        opt->params.type = GGML_OPT_ADAM;

        GGUF_GET_KEY(fctx, opt->adam.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
        GGUF_GET_KEY(fctx, opt->adam.fx_prev,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
@@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
        copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
        copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
    } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-        opt->params.type = GGML_OPT_TYPE_LBFGS;
+        opt->params.type = GGML_OPT_LBFGS;

        GGUF_GET_KEY(fctx, opt->params.lbfgs.m,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
        GGUF_GET_KEY(fctx, opt->lbfgs.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
@@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
    gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);

    switch (opt->params.type) {
-        case GGML_OPT_TYPE_ADAM:
+        case GGML_OPT_ADAM:
            {
                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS,            opt->adam.fx_best);
@@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
                    gguf_add_tensor(fctx, opt->adam.pf);
                }
            } break;
-        case GGML_OPT_TYPE_LBFGS:
+        case GGML_OPT_LBFGS:
            {
                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -8,10 +8,9 @@ import json
 import os
 import re
 import sys
-from abc import ABC, abstractmethod
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast

 import numpy as np
 import torch
@@ -37,12 +36,7 @@ class SentencePieceTokenTypes(IntEnum):
    BYTE = 6


-AnyModel = TypeVar("AnyModel", bound="type[Model]")
-
-
-class Model(ABC):
-    _model_classes: dict[str, type[Model]] = {}
-
+class Model:
    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
        self.dir_model = dir_model
        self.ftype = ftype
@@ -53,14 +47,10 @@ class Model(ABC):
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
+        self.model_arch = self._get_model_architecture()
        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])

-    @property
-    @abstractmethod
-    def model_arch(self) -> gguf.MODEL_ARCH:
-        pass
-
    def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
        key = next((k for k in keys if k in self.hparams), None)
        if key is not None:
@@ -106,11 +96,9 @@ class Model(ABC):
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)

-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
+        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
@@ -186,22 +174,51 @@ class Model(ABC):
        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
            return json.load(f)

-    @classmethod
-    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
-        assert names
-
-        def func(modelcls: type[Model]):
-            for name in names:
-                cls._model_classes[name] = modelcls
-            return modelcls
-        return func
-
-    @classmethod
-    def from_model_architecture(cls, arch):
-        try:
-            return cls._model_classes[arch]
-        except KeyError:
-            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
+    @staticmethod
+    def from_model_architecture(model_architecture):
+        if model_architecture == "GPTNeoXForCausalLM":
+            return GPTNeoXModel
+        if model_architecture == "BloomForCausalLM":
+            return BloomModel
+        if model_architecture == "MPTForCausalLM":
+            return MPTModel
+        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
+            return BaichuanModel
+        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
+            return FalconModel
+        if model_architecture == "GPTBigCodeForCausalLM":
+            return StarCoderModel
+        if model_architecture == "GPTRefactForCausalLM":
+            return RefactModel
+        if model_architecture == "PersimmonForCausalLM":
+            return PersimmonModel
+        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+            return StableLMModel
+        if model_architecture == "QWenLMHeadModel":
+            return QwenModel
+        if model_architecture == "Qwen2ForCausalLM":
+            return Model
+        if model_architecture == "MixtralForCausalLM":
+            return MixtralModel
+        if model_architecture == "GPT2LMHeadModel":
+            return GPT2Model
+        if model_architecture == "PhiForCausalLM":
+            return Phi2Model
+        if model_architecture == "PlamoForCausalLM":
+            return PlamoModel
+        if model_architecture == "CodeShellForCausalLM":
+            return CodeShellModel
+        if model_architecture == "OrionForCausalLM":
+            return OrionModel
+        if model_architecture == "InternLM2ForCausalLM":
+            return InternLM2Model
+        if model_architecture == "MiniCPMForCausalLM":
+            return MiniCPMModel
+        if model_architecture == "BertModel":
+            return BertModel
+        if model_architecture == "NomicBertModel":
+            return NomicBertModel
+        return Model

    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
@@ -216,6 +233,53 @@ class Model(ABC):
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

+    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
+        arch = self.hparams["architectures"][0]
+        if arch == "GPTNeoXForCausalLM":
+            return gguf.MODEL_ARCH.GPTNEOX
+        if arch == "BloomForCausalLM":
+            return gguf.MODEL_ARCH.BLOOM
+        if arch == "MPTForCausalLM":
+            return gguf.MODEL_ARCH.MPT
+        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
+            return gguf.MODEL_ARCH.BAICHUAN
+        if arch in ("FalconForCausalLM", "RWForCausalLM"):
+            return gguf.MODEL_ARCH.FALCON
+        if arch == "GPTBigCodeForCausalLM":
+            return gguf.MODEL_ARCH.STARCODER
+        if arch == "GPTRefactForCausalLM":
+            return gguf.MODEL_ARCH.REFACT
+        if arch == "PersimmonForCausalLM":
+            return gguf.MODEL_ARCH.PERSIMMON
+        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+            return gguf.MODEL_ARCH.STABLELM
+        if arch == "QWenLMHeadModel":
+            return gguf.MODEL_ARCH.QWEN
+        if arch == "Qwen2ForCausalLM":
+            return gguf.MODEL_ARCH.QWEN2
+        if arch == "MixtralForCausalLM":
+            return gguf.MODEL_ARCH.LLAMA
+        if arch == "GPT2LMHeadModel":
+            return gguf.MODEL_ARCH.GPT2
+        if arch == "PhiForCausalLM":
+            return gguf.MODEL_ARCH.PHI2
+        if arch == "PlamoForCausalLM":
+            return gguf.MODEL_ARCH.PLAMO
+        if arch == "CodeShellForCausalLM":
+            return gguf.MODEL_ARCH.CODESHELL
+        if arch == "OrionForCausalLM":
+            return gguf.MODEL_ARCH.ORION
+        if arch == "InternLM2ForCausalLM":
+            return gguf.MODEL_ARCH.INTERNLM2
+        if arch == "MiniCPMForCausalLM":
+            return gguf.MODEL_ARCH.MINICPM
+        if arch == "BertModel":
+            return gguf.MODEL_ARCH.BERT
+        if arch == "NomicBertModel":
+            return gguf.MODEL_ARCH.NOMIC_BERT
+
+        raise NotImplementedError(f'Architecture "{arch}" not supported!')
+
    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
@@ -383,10 +447,7 @@ class Model(ABC):
        special_vocab.add_to_gguf(self.gguf_writer)


-@Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
-    model_arch = gguf.MODEL_ARCH.GPTNEOX
-
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]

@@ -403,10 +464,7 @@ class GPTNeoXModel(Model):
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])


-@Model.register("BloomForCausalLM")
 class BloomModel(Model):
-    model_arch = gguf.MODEL_ARCH.BLOOM
-
    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Bloom")
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
@@ -498,10 +556,7 @@ class BloomModel(Model):
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


-@Model.register("MPTForCausalLM")
 class MPTModel(Model):
-    model_arch = gguf.MODEL_ARCH.MPT
-
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
@@ -563,11 +618,13 @@ class MPTModel(Model):

            self.gguf_writer.add_tensor(new_name, data)

+            # note: MPT output is tied to (same as) wte in original model;
+            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+            if new_name == "token_embd.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+

-@Model.register("OrionForCausalLM")
 class OrionModel(Model):
-    model_arch = gguf.MODEL_ARCH.ORION
-
    def set_vocab(self):
        self._set_vocab_sentencepiece()

@@ -598,8 +655,6 @@ class OrionModel(Model):
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
-        # note: config provides rms norm but it is actually layer norm
-        # ref:  https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])

    def write_tensors(self):
@@ -646,10 +701,7 @@ class OrionModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
 class BaichuanModel(Model):
-    model_arch = gguf.MODEL_ARCH.BAICHUAN
-
    def set_vocab(self):
        self._set_vocab_sentencepiece()

@@ -764,10 +816,7 @@ class BaichuanModel(Model):
        return weights[r * n_part:r * n_part + r, ...]


-@Model.register("FalconForCausalLM", "RWForCausalLM")
 class FalconModel(Model):
-    model_arch = gguf.MODEL_ARCH.FALCON
-
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_hidden_layers")
        if block_count is None:
@@ -860,10 +909,7 @@ class FalconModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("GPTBigCodeForCausalLM")
 class StarCoderModel(Model):
-    model_arch = gguf.MODEL_ARCH.STARCODER
-
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

@@ -878,10 +924,7 @@ class StarCoderModel(Model):
        self.gguf_writer.add_file_type(self.ftype)


-@Model.register("GPTRefactForCausalLM")
 class RefactModel(Model):
-    model_arch = gguf.MODEL_ARCH.REFACT
-
    def set_gguf_parameters(self):
        hidden_dim = self.hparams["n_embd"]
        inner_dim = 4 * hidden_dim
@@ -965,10 +1008,7 @@ class RefactModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("PersimmonForCausalLM")
 class PersimmonModel(Model):
-    model_arch = gguf.MODEL_ARCH.PERSIMMON
-
    def set_gguf_parameters(self):
        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
        head_count = self.hparams["num_attention_heads"]
@@ -991,6 +1031,7 @@ class PersimmonModel(Model):
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])

    def set_vocab(self):
        self._set_vocab_sentencepiece()
@@ -1016,10 +1057,7 @@ class PersimmonModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
 class StableLMModel(Model):
-    model_arch = gguf.MODEL_ARCH.STABLELM
-
    def set_vocab(self):
        if (self.dir_model / "tokenizer.json").is_file():
            self._set_vocab_gpt2()
@@ -1036,25 +1074,18 @@ class StableLMModel(Model):
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
-        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_layer_norm_eps(1e-5)


-@Model.register("MixtralForCausalLM")
 class MixtralModel(Model):
-    model_arch = gguf.MODEL_ARCH.LLAMA
-
    def set_vocab(self):
        self._set_vocab_sentencepiece()


-@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
-    model_arch = gguf.MODEL_ARCH.MINICPM
-
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
        self.gguf_writer.add_name("MiniCPM")
@@ -1131,10 +1162,7 @@ class MiniCPMModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("QWenLMHeadModel")
 class QwenModel(Model):
-    model_arch = gguf.MODEL_ARCH.QWEN
-
    @staticmethod
    def token_bytes_to_string(b):
        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
@@ -1214,15 +1242,7 @@ class QwenModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("Qwen2ForCausalLM")
-class Qwen2Model(Model):
-    model_arch = gguf.MODEL_ARCH.QWEN2
-
-
-@Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
-    model_arch = gguf.MODEL_ARCH.GPT2
-
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
@@ -1284,10 +1304,7 @@ class GPT2Model(Model):
                self.gguf_writer.add_tensor("output.weight", data)


-@Model.register("PhiForCausalLM")
 class Phi2Model(Model):
-    model_arch = gguf.MODEL_ARCH.PHI2
-
    def set_gguf_parameters(self):
        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])

@@ -1309,10 +1326,7 @@ class Phi2Model(Model):
        self.gguf_writer.add_add_bos_token(False)


-@Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
-    model_arch = gguf.MODEL_ARCH.PLAMO
-
    def set_vocab(self):
        self._set_vocab_sentencepiece()

@@ -1391,10 +1405,7 @@ class PlamoModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("CodeShellForCausalLM")
 class CodeShellModel(Model):
-    model_arch = gguf.MODEL_ARCH.CODESHELL
-
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]

@@ -1459,10 +1470,7 @@ class CodeShellModel(Model):
                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")


-@Model.register("InternLM2ForCausalLM")
 class InternLM2Model(Model):
-    model_arch = gguf.MODEL_ARCH.INTERNLM2
-
    def set_vocab(self):
        # (TODO): Is there a better way?
        # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
@@ -1634,10 +1642,7 @@ in chat mode so that the conversation can end normally.")
                self.post_write_tensors(tensor_map, name, data_torch)


-@Model.register("BertModel")
 class BertModel(Model):
-    model_arch = gguf.MODEL_ARCH.BERT
-
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.vocab_size = None
@@ -1647,17 +1652,16 @@ class BertModel(Model):
        self.gguf_writer.add_causal_attention(False)

        # get pooling path
+        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
+            modules = json.load(f)
        pooling_path = None
-        module_path = self.dir_model / "modules.json"
-        if module_path.is_file():
-            with open(module_path, encoding="utf-8") as f:
-                modules = json.load(f)
-            for mod in modules:
-                if mod["type"] == "sentence_transformers.models.Pooling":
-                    pooling_path = mod["path"]
-                    break
+        for mod in modules:
+            if mod["type"] == "sentence_transformers.models.Pooling":
+                pooling_path = mod["path"]
+                break

        # get pooling type
+        pooling_type = gguf.PoolingType.NONE
        if pooling_path is not None:
            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
                pooling = json.load(f)
@@ -1667,7 +1671,8 @@ class BertModel(Model):
                pooling_type = gguf.PoolingType.CLS
            else:
                raise NotImplementedError("Only MEAN and CLS pooling types supported")
-            self.gguf_writer.add_pooling_type(pooling_type)
+
+        self.gguf_writer.add_pooling_type(pooling_type.value)

    def set_vocab(self):
        path = self.dir_model
@@ -1743,10 +1748,7 @@ class BertModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


-@Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
-    model_arch = gguf.MODEL_ARCH.NOMIC_BERT
-
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

@@ -1783,70 +1785,6 @@ class NomicBertModel(BertModel):
            yield name, data


-@Model.register("GemmaForCausalLM")
-class GemmaModel(Model):
-    model_arch = gguf.MODEL_ARCH.GEMMA
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_key_length(hparams["head_dim"])
-        self.gguf_writer.add_value_length(hparams["head_dim"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
-            if name.endswith("norm.weight"):
-                data_torch = data_torch + 1
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("Starcoder2ForCausalLM")
-class StarCoder2Model(Model):
-    model_arch = gguf.MODEL_ARCH.STARCODER2
-
-
 ###### CONVERSION LOGIC ######


--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
        raise ValueError('Unable to load metadata')
    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
    vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
    return params, vocab, special_vocab

@@ -398,8 +398,8 @@ def handle_args():
                        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type=Path,
                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", default="spm,hfft",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
    return parser.parse_args()


--- a/convert.py
+++ b/convert.py
@@ -1282,32 +1282,35 @@ def load_some_model(path: Path) -> ModelPlus:


 class VocabFactory:
-    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
-
    def __init__(self, path: Path):
        self.path = path
-        self.file_paths = self._detect_files()
-        print(f"Found vocab files: {self.file_paths}")
+        self.files: dict[str, Path | None] = {
+            "tokenizer.model": None,
+            "vocab.json": None,
+            "tokenizer.json": None,
+        }
+        self._detect_files()

-    def _detect_files(self) -> dict[str, Path | None]:
-        def locate(file: str) -> Path | None:
-            if (path := self.path / file).exists():
-                return path
-            if (path := self.path.parent / file).exists():
-                return path
-            return None
+    def _detect_files(self):
+        for file in self.files.keys():
+            file_path = self.path / file
+            parent_file_path = self.path.parent / file
+            if file_path.exists():
+                self.files[file] = file_path
+            elif parent_file_path.exists():
+                self.files[file] = parent_file_path
+        print(f"Found vocab files: {self.files}")

-        return {vt: locate(f) for vt, f in self._FILES.items()}
-
-    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
-        for vtype in vocab_types:
-            try:
-                path = self.file_paths[vtype]
-            except KeyError:
-                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-            if path is not None:
-                return vtype, path
-        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
+    def _select_file(self, vocabtype: str | None) -> Path:
+        wanted = {"spm": ["tokenizer.model"], "bpe": ["vocab.json", "tokenizer.json"]}  # per-type candidates; tokenizer.json kept as bpe fallback - TODO confirm BpeVocab accepts it
+        for file_key in wanted.get(vocabtype or "", []):  # only files that belong to the requested type, in order of preference
+            if (file := self.files[file_key]) is not None:
+                return file
+        if vocabtype in wanted:
+            raise FileNotFoundError(f"{vocabtype} vocab not found.")
+        if vocabtype == "hfft":
+            return self.path  # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")

    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
@@ -1319,30 +1322,30 @@ class VocabFactory:
            n_vocab=n_vocab,
        )

-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
-        vocab_type, path = self._select_file(vocab_types)
-        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
+    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        path = self._select_file(vocabtype)
+        print(f"Loading vocab file '{path}', type '{vocabtype}'")

-        added_tokens_path = path.parent / "added_tokens.json"
+        added_tokens_path = (path if path.is_dir() else path.parent) / "added_tokens.json"  # for hfft, path is already the directory
        vocab: Vocab
-        if vocab_type == "bpe":
+        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
-        elif vocab_type == "spm":
+        elif vocabtype == "spm":
            vocab = SentencePieceVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
-        elif vocab_type == "hfft":
+        elif vocabtype == "hfft":
            vocab = HfVocab(
-                path.parent, added_tokens_path if added_tokens_path.exists() else None
+                path, added_tokens_path if added_tokens_path.exists() else None
            )
        else:
-            raise ValueError(vocab_type)
+            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
        # FIXME: Respect --vocab-dir?
        special_vocab = self._create_special_vocab(
            vocab,
-            vocab_type,
+            vocabtype,
            model_parent_path,
        )
        return vocab, special_vocab
@@ -1376,14 +1379,15 @@ def main(args_in: list[str] | None = None) -> None:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
-    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
+    vocab_types = ["spm", "bpe", "hfft"]
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
    parser.add_argument("--awq-path",     type=Path,              help="Path to scale awq cache file", default=None)
    parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
    parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
+    parser.add_argument("--vocab-type",   choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
    parser.add_argument("--outfile",      type=Path,              help="path to write to; default: based on input")
    parser.add_argument("model",          type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--ctx",          type=int,               help="model training context (default: based on input)")
@@ -1444,7 +1448,7 @@ def main(args_in: list[str] | None = None) -> None:
    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
+    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path)

    if args.vocab_only:
        if not args.outfile:
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"

 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
-CTX_SIZE="${CTX_SIZE:-4096}"
+KV_SIZE="${KV_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"

 GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
+--kv_size "$KV_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -10,7 +10,7 @@ cd ..
 ./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
       --color \
       -f ./prompts/alpaca.txt \
-       --ctx_size 2048 \
+       --kv_size 2048 \
       -n -1 \
       -ins -b 256 \
       --top_k 10000 \
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
                // Vcur shape [n_embd, N, 1, 1]
                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+                // kv_self.k shape [n_embd * kv_size * n_layer, 1]
+                // kv_self.v shape [n_embd * kv_size * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                            (   kv_size)*ggml_element_size(kv_self.v),
+                            (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
                        Qcur,
                        0, 2, 1, 3);

-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+            // kv_self.k shape [n_embd * kv_size * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
@@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(

                assert_shape_3d(Vcur, N, n_embd, n_batch);

-                // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
-                // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+                // kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
+                // kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
                // k         shape [n_embd * N, n_batch]   == kv_self.k[:,n_past:n_past+N,:,il]
                // v         shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]

                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                            (   kv_size)*ggml_element_size(kv_self.v),
+                            (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
                        0, 2, 1, 3);
            assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

-            // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
+            // kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
            // K shape [n_embd/n_head, n_past + N, n_head, n_batch]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
@@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
            assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

            // split cached V into n_head heads
-            // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
+            // kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
            // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
            struct ggml_tensor * V =
                ggml_view_4d(ctx0, vc,
@@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
                                                                cur)),
                                                        n_embd, N)));

-                // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
-                // kv_self.v shape [n_embd * n_ctx * n_layer, 1]
+                // kv_self.k shape [n_embd * kv_size * n_layer, 1]
+                // kv_self.v shape [n_embd * kv_size * n_layer, 1]
                // k         shape [n_embd * N, 1]   == kv_self.k[:,n_past:n_past+N,il,0]
                // v         shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

                /* {
-                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                    struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
                    struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
-                            (   n_ctx)*ggml_element_size(kv_self.v),
-                            (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
+                            (   kv_size)*ggml_element_size(kv_self.v),
+                            (il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

                    // important: storing RoPE-ed version of K in the KV cache!
                    ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
                        Qcur,
                        0, 2, 1, 3);

-            // kv_self.k shape [n_embd * n_ctx * n_layer, 1]
+            // kv_self.k shape [n_embd * kv_size * n_layer, 1]
            // K shape [n_embd/n_head, n_past + N, n_head, 1]
            struct ggml_tensor * K =
                ggml_permute(ctx0,
@@ -1470,7 +1470,7 @@ int main(int argc, char ** argv) {
 /*
    struct llama_model_lora model_lora;
    // model.hparams.n_vocab = 6;
-    // model.hparams.n_ctx   = 64;
+    // model.hparams.kv_size = 64;
    // model.hparams.n_embd  = 128;
    // model.hparams.n_mult  = 2;
    // model.hparams.n_head  = 8;
@@ -1478,7 +1478,7 @@ int main(int argc, char ** argv) {
    // model.hparams.n_rot   = model.hparams.n_embd / model.hparams.n_head;

    model_lora.hparams.n_vocab = 16;
-    model_lora.hparams.n_ctx   = 32;
+    model_lora.hparams.kv_size = 32;
    model_lora.hparams.n_embd  = 256;
    model_lora.hparams.n_mult  = 2;
    model_lora.hparams.n_head  = 16;
@@ -1533,28 +1533,27 @@ int main(int argc, char ** argv) {

        int n_past = 0;

-        struct ggml_cgraph * gf = NULL;
-        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
+        ggml_cgraph gf = {};

        get_example_targets_batch(ctx0, 64*ex+0,  tokens_input, targets);

-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
+        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

-        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(&gf, e);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        float error_before_opt = ggml_get_f32_1d(e, 0);

-        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
+        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
        opt_params_lbfgs.print_forward_graph = false;
        opt_params_lbfgs.print_backward_graph = false;
        opt_params_lbfgs.lbfgs.n_iter = 16;
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
-        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_build_forward_expand(&gf, e);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

        float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1601,14 +1600,13 @@ int main(int argc, char ** argv) {
            };
            struct ggml_context * ctx0 = ggml_init(params);

-            struct ggml_cgraph * gf = NULL;
-            gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
+            ggml_cgraph gf = {};

            int n_past = 0;
-            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
+            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

-            ggml_build_forward_expand(gf, logits);
-            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+            ggml_build_forward_expand(&gf, logits);
+            ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,15 +32,16 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }

    int n_kv_max     = 2048;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;
+    int mmq          = 0;

    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
    std::vector<int> n_tg = { 128, 256, };
@@ -64,15 +65,19 @@ int main(int argc, char ** argv) {
    }

    if (argc >= 6) {
-        n_pp = parse_list(argv[5]);
+        mmq = std::atoi(argv[5]);
    }

    if (argc >= 7) {
-        n_tg = parse_list(argv[6]);
+        n_pp = parse_list(argv[6]);
    }

    if (argc >= 8) {
-        n_pl = parse_list(argv[7]);
+        n_tg = parse_list(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_pl = parse_list(argv[8]);
    }

    // init LLM
@@ -99,8 +104,9 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed      = 1234;
-    ctx_params.n_ctx     = n_kv_max;
+    ctx_params.kv_size   = n_kv_max;
    ctx_params.n_batch   = 512;
+    ctx_params.mul_mat_q = mmq;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -153,7 +159,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par

 var context_params = llama_context_default_params()
 context_params.seed = 1234
-context_params.n_ctx = n_kv_req
+context_params.kv_size = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
@@ -53,12 +53,12 @@ defer {
    llama_free(context)
 }

-let n_ctx = llama_n_ctx(context)
+let kv_size = llama_kv_size(context)

-print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
+print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")

-if n_kv_req > n_ctx {
-    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
+if n_kv_req > kv_size {
+    print("error: n_kv_req (\(n_kv_req)) > kv_size, the required KV cache size is not big enough\n") // print() has no printf-style formatting, so interpolate the value
    exit(1)
 }

--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt

 ...

-main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
+main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113

 Hello my name is

--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = n_kv_req;
+    ctx_params.kv_size = n_kv_req;
    ctx_params.n_batch = std::max(n_len, n_parallel);
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    const int n_ctx    = llama_n_ctx(ctx);
+    const int kv_size    = llama_kv_size(ctx);

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+    if (n_kv_req > kv_size) {
+        LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_TEE("%s:        either reduce n_parallel or increase kv_size\n", __func__);
        return 1;
    }

--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -139,8 +139,8 @@ int main(int argc, char ** argv)

    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

-    const size_t max_context_size     = llama_n_ctx( ctx );
-    const size_t max_tokens_list_size = max_context_size - 4 ;
+    const size_t max_kv_size          = llama_kv_size(ctx);
+    const size_t max_tokens_list_size = max_kv_size - 4 ;

    if (tokens_list.size() > max_tokens_list_size)
    {
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -128,20 +128,20 @@ int main(int argc, char ** argv)  {
    // TODO: perform the bench for all types or for a user specified type
    const ggml_type qtype = GGML_TYPE_Q4_1;

-    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += 1024*1024*16;
+    size_t kv_size = 0;
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
+    kv_size += ggml_row_size(qtype, sizex*sizey);
+    kv_size += ggml_row_size(qtype, sizex*sizey);
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    kv_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    kv_size += 1024*1024*16;

-    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
+    printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size/1024/1024));

    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
+        /*.mem_size   =*/ kv_size,
        /*.mem_buffer =*/ NULL,
        /* no_alloc   =*/ 0
    };
--- a/examples/chat-13B.bat
+++ b/examples/chat-13B.bat
@@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
 rem if not defined N_THREAD set "N_THREAD=8"
 rem Number of tokens to predict (made it larger than default because we want a long interaction)
 if not defined N_PREDICTS set "N_PREDICTS=2048"
-if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"

 rem Default main script paths
 set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
 N_PREDICTS="${N_PREDICTS:-2048}"

 # Note: you can also override the generation options by specifying them on the command line:
-# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+# For example, override the KV cache size by doing: ./chatLLaMa --kv_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"

 DATE_TIME=$(date +%H:%M)
 DATE_YEAR=$(date +%Y)
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -27,9 +27,9 @@ SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+
 SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

-CTX_SIZE=2048
-CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
-OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
+KV_SIZE=2048
+KV_ROTATE_POINT=$((KV_SIZE * 3 / 5)) # REVIEW
+OPTS=(--model "$MODEL" --kv_size "$KV_SIZE" --repeat_last_n 256 "$@")

 # An unbuffered `tail -c+N`
 skip_bytes() {
@@ -84,7 +84,7 @@ n_tokens=0

 while read -e line; do
    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
-    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
+    n_predict=$((KV_SIZE - n_tokens - ${#line} / 2 - 32))

    # Swap prompts when we're about to run out of context
    if ((n_predict <= 0)); then
@@ -97,11 +97,11 @@ while read -e line; do
        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"

        n_tokens=0
-        n_predict=$((CTX_SIZE / 2))
+        n_predict=$((KV_SIZE / 2))
    fi

    echo " ${line}" >>"$CUR_PROMPT_FILE"
-    if ((n_tokens > CTX_ROTATE_POINT)); then
+    if ((n_tokens > KV_ROTATE_POINT)); then
        echo " ${line}" >>"$NEXT_PROMPT_FILE"
    fi

@@ -139,7 +139,7 @@ while read -e line; do

    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))

-    if ((n_tokens > CTX_ROTATE_POINT)); then
+    if ((n_tokens > KV_ROTATE_POINT)); then
        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
    fi

--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
 N_PREDICTS="${N_PREDICTS:-2048}"

 # Note: you can also override the generation options by specifying them on the command line:
-# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+# For example, override the KV cache size by doing: ./chatLLaMa --kv_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"

 DATE_TIME=$(date +%H:%M)
 DATE_YEAR=$(date +%Y)
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -226,7 +226,7 @@ struct llama_vocab {

 struct my_llama_hparams {
    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t kv_size   = 512;   // this is provided as user input?
    uint32_t n_embd  = 4096;
    uint32_t n_ff    = 11008;
    uint32_t n_mult  = 4;
@@ -326,7 +326,7 @@ struct train_params {

 static void print_params(struct my_llama_hparams * params) {
    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: kv_size: %u\n", __func__, params->kv_size);
    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
    printf("%s: n_head:  %u\n", __func__, params->n_head);
@@ -732,7 +732,7 @@ static void save_as_llama_model(
    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);

-    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
+    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.kv_size);
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
@@ -937,7 +937,7 @@ int main(int argc, char ** argv) {

    struct my_llama_model model;
    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
-    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.kv_size = params.n_ctx;
    model.hparams.n_embd  = config.dim; //params.n_embd;
    model.hparams.n_ff    = config.hidden_dim;
    model.hparams.n_mult  = 32;//params.n_mult;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -19,11 +19,11 @@ static std::vector<std::string> split_lines(const std::string & s) {

 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+        llama_batch_add(batch, tokens[i], i, { seq_id }, false);
    }
 }

-static void normalize(const float * vec, float * out, int n) {
+static void normalize(float * vec, float * out, int n) {
    float norm = 0;
    for (int i = 0; i < n; i++) {
        norm += vec[i] * vec[i];
@@ -45,23 +45,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    }

    // normalize on copy
-    for (int i = 0; i < batch.n_tokens; i++) {
-        if (!batch.logits[i]) {
-            continue;
-        }
-
-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
-
-        float * out = output + batch.seq_id[i][0] * n_embd;
-        normalize(embd, out, n_embd);
+    for (int k = 0; k < n_seq; k++) {
+        float * emb = llama_get_embeddings_ith(ctx, k);
+        float * out = output + k * n_embd;
+        normalize(emb, out, n_embd);
    }
 }

@@ -101,11 +88,11 @@ int main(int argc, char ** argv) {
    }

    const int n_ctx_train = llama_n_ctx_train(model);
-    const int n_ctx = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);

-    if (n_ctx > n_ctx_train) {
+    if (kv_size > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+                __func__, n_ctx_train, kv_size);
    }

    // print system information
@@ -119,7 +106,7 @@ int main(int argc, char ** argv) {

    // max batch size
    const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch == params.n_ctx);
+    GGML_ASSERT(params.n_batch == params.kv_size);

    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
@@ -145,7 +132,7 @@ int main(int argc, char ** argv) {

    // initialize batch
    const int n_prompts = prompts.size();
-    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+    struct llama_batch batch = llama_batch_init(n_batch, 0, n_prompts);

    // allocate output
    const int n_embd = llama_n_embd(model);
@@ -158,7 +145,6 @@ int main(int argc, char ** argv) {
    for (int k = 0; k < n_prompts; k++) {
        // clamp to n_batch tokens
        auto & inp = inputs[k];
-
        const uint64_t n_toks = inp.size();

        // encode if at capacity
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -16,7 +16,7 @@

 struct my_llama_hparams {
    uint32_t n_vocab    = 32000;
-    uint32_t n_ctx      = 512;
+    uint32_t kv_size    = 512;
    uint32_t n_embd     = 4096;
    uint32_t n_ff       = 11008;
    uint32_t n_head     = 32;
@@ -190,7 +190,7 @@ static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";

 static void print_params(struct my_llama_hparams * params) {
    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
+    printf("%s: kv_size               : %u\n", __func__, params->kv_size);
    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
    printf("%s: n_head                : %u\n", __func__, params->n_head);
@@ -250,7 +250,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
    };

    GGUF_GET_KEY(ctx, hparams->n_embd,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams->n_ctx,          gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
+    GGUF_GET_KEY(ctx, hparams->kv_size,        gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
    GGUF_GET_KEY(ctx, hparams->n_ff,           gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_FEED_FORWARD_LENGTH));
    GGUF_GET_KEY(ctx, hparams->n_head,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
    GGUF_GET_KEY(ctx, hparams->n_layer,        gguf_get_val_u32, GGUF_TYPE_UINT32,  true, kv(LLM_KV_BLOCK_COUNT));
@@ -268,7 +268,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
    }
 }

-static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
+static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t kv_size) {
    auto & hparams = model->hparams;

    std::vector<char> tn_buf;
@@ -298,7 +298,7 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
        gguf_free(mctx);
    }
    hparams.n_vocab = llama_n_vocab(input);
-    hparams.n_ctx = n_ctx;
+    hparams.kv_size = kv_size;

    // get tensors from llama_model (possibly mmapped)
    model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
@@ -529,7 +529,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    const int n_past = 0;
    const int N = n_tokens;
    const auto & hparams  = model->hparams;
-    const int n_ctx       = hparams.n_ctx;
+    const int kv_size     = hparams.kv_size;
    const int n_vocab     = hparams.n_vocab;
    const int n_embd      = hparams.n_embd;
    const int n_layer     = hparams.n_layer;
@@ -558,13 +558,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    ggml_set_input(KQ_pos);

    // rope has so much parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
+    auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
                (struct ggml_tensor * t) -> struct ggml_tensor * {
        // not capturing these, to silcence warnings
        const int rope_mode = 0;

        return ggml_rope_custom(ctx,
-            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+            t, KQ_pos, n_rot, rope_mode, kv_size, 0,
            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };
@@ -848,7 +848,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);

-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx);
+    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.kv_size);
    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd);
    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff);
    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head);
@@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) {
    lora.hparams.n_rank_output         = n_rank_output;

    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
+    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt->params.print_forward_graph     = false;
    opt->params.print_backward_graph    = false;
    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
@@ -1554,9 +1554,9 @@ int main(int argc, char ** argv) {
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);

    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
+        // overwrite the previous kv_size with the user-provided context size (params.common.n_ctx)
        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
+            model.hparams.kv_size = params.common.n_ctx;
        }

        const bool opt_param_count_changed = (
@@ -1625,7 +1625,7 @@ int main(int argc, char ** argv) {
    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
    printf("%s: opt iter %d\n", __func__, opt->iter);

-    int n_tokens = model.hparams.n_ctx;
+    int n_tokens = model.hparams.kv_size;
    int n_vocab  = model.hparams.n_vocab;
    int n_batch  = params.common.n_batch;

--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -10,6 +10,6 @@ cd ..
 ./main --color --instruct --threads 4 \
       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
       --file ./prompts/alpaca.txt \
-       --batch_size 8 --ctx_size 2048 -n -1 \
+       --batch_size 8 --kv_size 2048 -n -1 \
       --repeat_last_n 64 --repeat_penalty 1.3 \
       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -325,7 +325,7 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
+    const int kv_size  = llama_kv_size(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@@ -336,17 +336,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (from_chunk > 0) {
-        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
+        if (size_t((from_chunk + 2)*kv_size) >= tokens.size()) {
            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
            return false;
        }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
-        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk * kv_size);
+        tokens.erase(tokens.begin(), tokens.begin() + from_chunk * kv_size);
    }

-    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
+    if (int(tokens.size()) < 2*kv_size) {
+        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2 * kv_size,
+                kv_size);
        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return false;
    }
@@ -359,7 +359,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
        prob_history.resize(tokens.size());
    }

-    const int n_chunk_max = tokens.size() / n_ctx;
+    const int n_chunk_max = tokens.size() / kv_size;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@@ -373,16 +373,16 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

-    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+    const int num_batches = (kv_size + n_batch - 1) / n_batch;

    std::vector<float> logits;
    if (compute_ppl && num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
+        logits.reserve((size_t)kv_size * n_vocab);
    }

    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * n_ctx;
-        const int end   = start + n_ctx;
+        const int start = i * kv_size;
+        const int end   = start + kv_size;

        std::vector<float> logits;

@@ -431,11 +431,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
        }

        if (compute_ppl) {
-            const int first = n_ctx/2;
+            const int first = kv_size / 2;
            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-            count += n_ctx - first - 1;
+            count += kv_size - first - 1;

            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
            fflush(stdout);
@@ -553,7 +553,7 @@ int main(int argc, char ** argv) {
    }

    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    params.n_batch = std::min(params.n_batch, params.kv_size);

    print_build_info();

@@ -593,9 +593,9 @@ int main(int argc, char ** argv) {
    }

    const int n_ctx_train = llama_n_ctx_train(model);
-    if (params.n_ctx > n_ctx_train) {
+    if (params.kv_size > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, params.n_ctx);
+                __func__, n_ctx_train, params.kv_size);
    }

    // print system information
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -14,7 +14,8 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
+-   `-kv N`, `--kv-size N`: Specify the total size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

 ## Input Prompts

--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -135,9 +135,9 @@ int main(int argc, char ** argv) {
        return 0;
    }

-    if (params.n_ctx != 0 && params.n_ctx < 8) {
+    if (params.kv_size != 0 && params.kv_size < 8) {
        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
+        params.kv_size = 8;
    }
    if (params.instruct) {
        printf("\n************\n");
@@ -225,12 +225,12 @@ int main(int argc, char ** argv) {
    }

    const int n_ctx_train = llama_n_ctx_train(model);
-    const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
+    const int kv_size     = llama_kv_size(ctx);
+    LOG("kv_size: %d\n", kv_size);

-    if (n_ctx > n_ctx_train) {
+    if (kv_size > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+                __func__, n_ctx_train, kv_size);
    }

    // print system information
@@ -291,8 +291,8 @@ int main(int argc, char ** argv) {
        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
    }

-    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+    if ((int) embd_inp.size() > kv_size - 4) {
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
        return 1;
    }

@@ -366,7 +366,7 @@ int main(int argc, char ** argv) {
        }
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

    LOG_TEE("\n#####  Infill mode  #####\n\n");
@@ -378,10 +378,10 @@ int main(int argc, char ** argv) {
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
-            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
-            control_message = " - Press Return to return control to LLaMA.\n"
+            control_message = " - Press Return to return control to LLaMa.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
@@ -416,9 +416,9 @@ int main(int argc, char ** argv) {
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
-            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // Note: kv_size - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
-            int max_embd_size = n_ctx - 4;
+            int max_embd_size = kv_size - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
@@ -434,8 +434,8 @@ int main(int argc, char ** argv) {
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
-            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+            // - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
+            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
@@ -444,11 +444,11 @@ int main(int argc, char ** argv) {
                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

-                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                    n_past, n_left, n_ctx, params.n_keep, n_discard);
+                LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
+                    n_past, n_left, kv_size, params.n_keep, n_discard);

-                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

                n_past -= n_discard;

--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -87,21 +87,7 @@ class SchemaConverter:
        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            list_item_operator = f'("," space {item_rule_name})'
-            successive_items = ""
-            min_items = schema.get("minItems", 0)
-            if min_items > 0:
-               first_item = f"({item_rule_name})"
-               successive_items = list_item_operator * (min_items - 1)
-               min_items -= 1
-            else:
-               first_item = f"({item_rule_name})?"
-            max_items = schema.get("maxItems")
-            if max_items is not None and max_items > min_items:
-                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
-            else:
-                successive_items += list_item_operator + "*"
-            rule = f'"[" space {first_item} {successive_items} "]" space'
+            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            return self._add_rule(rule_name, rule)

        else:
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -35,6 +35,7 @@ options:
  -mg, --main-gpu <i>                 (default: 0)
  -nkvo, --no-kv-offload <0|1>        (default: 0)
  -mmp, --mmap <0|1>                  (default: 1)
+  -mmq, --mul-mat-q <0|1>             (default: 1)
  -ts, --tensor_split <ts0/ts1/..>    (default: 0)
  -r, --repetitions <n>               (default: 5)
  -o, --output <csv|json|md|sql>      (default: md)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -123,15 +123,20 @@ static std::string get_gpu_info() {
    }
 #endif
 #ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
+    int device_list[GGML_SYCL_MAX_DEVICES];
+    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
+
+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
+        if (device_list[i] >0 ){
+            char buf[128];
+            ggml_sycl_get_device_description(i, buf, sizeof(buf));
+            id += buf;
            id += "/";
        }
    }
+    if (id.length() >2 ) {
+        id.pop_back();
+    }
 #endif
    // TODO: other backends
    return id;
@@ -152,9 +157,9 @@ static const char * output_format_str(output_formats format) {

 static const char * split_mode_str(llama_split_mode mode) {
    switch (mode) {
-        case LLAMA_SPLIT_MODE_NONE:  return "none";
-        case LLAMA_SPLIT_MODE_LAYER: return "layer";
-        case LLAMA_SPLIT_MODE_ROW:   return "row";
+        case LLAMA_SPLIT_NONE:  return "none";
+        case LLAMA_SPLIT_LAYER: return "layer";
+        case LLAMA_SPLIT_ROW:   return "row";
        default: GGML_ASSERT(!"invalid split mode");
    }
 }
@@ -171,6 +176,7 @@ struct cmd_params {
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
+    std::vector<bool> mul_mat_q;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    int reps;
@@ -187,9 +193,10 @@ static const cmd_params cmd_params_defaults = {
    /* type_v        */ {GGML_TYPE_F16},
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
-    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
+    /* split_mode    */ {LLAMA_SPLIT_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
+    /* mul_mat_q     */ {true},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* reps          */ 5,
@@ -214,6 +221,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
    printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
@@ -350,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            for (const auto & m : p) {
                llama_split_mode mode;
                if (m == "none") {
-                    mode = LLAMA_SPLIT_MODE_NONE;
+                    mode = LLAMA_SPLIT_NONE;
                } else if (m == "layer") {
-                    mode = LLAMA_SPLIT_MODE_LAYER;
+                    mode = LLAMA_SPLIT_LAYER;
                } else if (m == "row") {
-                    mode = LLAMA_SPLIT_MODE_ROW;
+                    mode = LLAMA_SPLIT_ROW;
                } else {
                    invalid_param = true;
                    break;
@@ -375,6 +383,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
@@ -451,6 +466,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
+    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -470,6 +486,7 @@ struct cmd_params_instance {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
+    bool mul_mat_q;
    std::vector<float> tensor_split;
    bool use_mmap;

@@ -497,10 +514,11 @@ struct cmd_params_instance {
    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

-        cparams.n_ctx = n_prompt + n_gen;
+        cparams.kv_size = n_prompt + n_gen;
        cparams.n_batch = n_batch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
+        cparams.mul_mat_q = mul_mat_q;
        cparams.offload_kqv = !no_kv_offload;

        return cparams;
@@ -520,6 +538,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & nb : params.n_batch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
+    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
@@ -538,6 +557,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
+                /* .mul_mat_q    = */ mmq,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
            };
@@ -560,6 +580,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
+                /* .mul_mat_q    = */ mmq,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
            };
@@ -595,6 +616,7 @@ struct test {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
+    bool mul_mat_q;
    std::vector<float> tensor_split;
    bool use_mmap;
    int n_prompt;
@@ -617,6 +639,7 @@ struct test {
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
+        mul_mat_q = inst.mul_mat_q;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        n_prompt = inst.n_prompt;
@@ -690,7 +713,7 @@ struct test {
            "n_batch", "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload",
-            "tensor_split", "use_mmap",
+            "mul_mat_q", "tensor_split", "use_mmap",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@@ -710,7 +733,7 @@ struct test {
        }
        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "use_mmap") {
+            field == "mul_mat_q" || field == "use_mmap") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -744,7 +767,7 @@ struct test {
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload),
-            tensor_split_str, std::to_string(use_mmap),
+            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -908,6 +931,9 @@ struct markdown_printer : public printer {
        if (field == "n_threads") {
            return "threads";
        }
+        if (field == "mul_mat_q") {
+            return "mmq";
+        }
        if (field == "no_kv_offload") {
            return "nkvo";
        }
@@ -948,6 +974,9 @@ struct markdown_printer : public printer {
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
+        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
+            fields.emplace_back("mul_mat_q");
+        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -21,8 +21,12 @@ android {
            useSupportLibrary = true
        }
        ndk {
-            // Add NDK properties if wanted, e.g.
-            // abiFilters += listOf("arm64-v8a")
+            // Workaround for https://github.com/llvm/llvm-project/issues/65820
+            // affecting armeabi-v7a. Skip armeabi-v7a when invoked with
+            // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
+            if (project.hasProperty("skip-armeabi-v7a")) {
+                abiFilters += listOf("arm64-v8a", "x86_64", "x86")
+            }
        }
        externalNativeBuild {
            cmake {
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -68,8 +68,8 @@ actor LlamaContext {
        print("Using \(n_threads) threads")

        var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
-        ctx_params.n_ctx = 2048
+        ctx_params.seed    = 1234
+        ctx_params.kv_size = 2048
        ctx_params.n_threads       = UInt32(n_threads)
        ctx_params.n_threads_batch = UInt32(n_threads)

@@ -112,13 +112,13 @@ actor LlamaContext {
        tokens_list = tokenize(text: text, add_bos: true)
        temporary_invalid_cchars = []

-        let n_ctx = llama_n_ctx(context)
+        let kv_size = llama_kv_size(context)
        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)

-        print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
+        print("\n n_len = \(n_len), kv_size = \(kv_size), n_kv_req = \(n_kv_req)")

-        if n_kv_req > n_ctx {
-            print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
+        if n_kv_req > kv_size {
+            print("error: n_kv_req > kv_size, the required KV cache size is not big enough")
        }

        for id in tokens_list {
--- a/examples/llama2-13b.sh
+++ b/examples/llama2-13b.sh
@@ -9,7 +9,7 @@ cd ..

 ./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
       --color \
-       --ctx_size 2048 \
+       --kv_size 2048 \
       -n -1 \
       -ins -b 256 \
       --top_k 10000 \
--- a/examples/llama2.sh
+++ b/examples/llama2.sh
@@ -9,7 +9,7 @@ cd ..

 ./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
       --color \
-       --ctx_size 2048 \
+       --kv_size 2048 \
       -n -1 \
       -ins -b 256 \
       --top_k 10000 \
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -59,39 +59,14 @@ python ./convert.py ../llava-v1.5-7b --skip-unknown
 Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.

 ## LLaVA 1.6 gguf conversion
-1) First clone a LLaVA 1.6 model:
-```console
-git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
-```
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
-```console
-python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
-```
+
+1) Backup your pth/safetensor model files as llava-surgery modifies them
+2) Use `python llava-surgery-v2.py -C -m /path/to/hf-model` which also supports llava-1.5 variants pytorch as well as safetensor models:
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
-```console
-mkdir vit
-cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
-cp ../llava-v1.6-vicuna-7b/llava.projector vit/
-curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
-```
-
-4) Create the visual gguf model:
-```console
-python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
-```
+3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config_vit.json) and rename it to config.json.
+4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip-model-is-vision`
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
-
-5) Then convert the model to gguf format:
-```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
-```
-
-6) And finally we can run the llava-cli using the 1.6 model version:
-```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
-```
-
+5) Everything else as usual: convert.py the hf model, quantize as needed
 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
 **note** llava-1.6 greatly benefits from batched prompt processing (defaults work)

--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -616,9 +616,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            KQ = ggml_soft_max_inplace(ctx0, KQ);
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
            KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));

-            cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
+            cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size));
        }

        // attention output
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -230,7 +230,7 @@ static struct llava_context * llava_init(gpt_params * params) {
    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
-    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
+    ctx_params.kv_size              = params->kv_size < 2048 ? 2048 : params->kv_size; // we need a longer context size to process image embeddings

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

--- a/examples/llava/llava-surgery-v2.py
+++ b/examples/llava/llava-surgery-v2.py
@@ -65,7 +65,9 @@ def clean_vision_tower_from_checkpoint(checkpoint_path):
        for name in clip_tensors:
            del checkpoint[name]

+        # Save the updated checkpoint
        checkpoint_path = checkpoint_path
+        save_model(checkpoint, checkpoint_path, file_type)
        return True
    return False

@@ -150,6 +152,16 @@ for name in first_mm_tensors:
 if len(projector) > 0:
    save_model(projector, f"{args.model}/llava.projector", 'pytorch')

+for name in mm_tensors:
+    del last_checkpoint[name]
+for name in first_mm_tensors:
+    del first_checkpoint[name]
+
+if len(mm_tensors) > 0:
+    save_model(last_checkpoint, projector_checkpoint_path, file_type)
+if len(first_mm_tensors) > 0:
+    save_model(first_checkpoint, newline_checkpoint_path, file_type)
+
 print("Done!")
 print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@@ -25,6 +25,9 @@ if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")

+    # remove these tensors
+    for name in clip_tensors:
+        del checkpoint[name]

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
@@ -32,6 +35,7 @@ if len(clip_tensors) > 0:
            f.write("{}\n")


+    torch.save(checkpoint, path)

 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -103,15 +103,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    const size_t num_images = num_patches_width * num_patches_height + 1;

    // TODO: size calculation is not calculated - it's only tens of MB
-    size_t ctx_size = 0;
+    size_t kv_size = 0;

    {
-        ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
-        ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
+        kv_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
+        kv_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
    }

    struct ggml_init_params params {
-        /*.mem_size   =*/ ctx_size,
+        /*.mem_size   =*/ kv_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
    };
@@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
+    if (newline_tmp->backend != GGML_BACKEND_CPU) {
        if (newline_tmp->buffer == NULL) {
            printf("newline_tmp tensor buffer is NULL\n");
        }
@@ -311,7 +311,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    return true;
 }

-bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -31,8 +31,6 @@ struct llava_image_embed {
 /** sanity check for clip <-> llava embed size match */
 LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);

-LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
-
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -73,8 +73,8 @@ int main(int argc, char ** argv) {
    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
    all = inp;

-    const int max_context_size     = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
+    const int max_kv_size          = llama_kv_size(ctx);
+    const int max_tokens_list_size = max_kv_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
@@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
    // seq_id == 0           : the current input token
    // seq_id [1, W]         : tokens from the past N - 1 Jacobi iterations
    // seq_id [W + 1, W + G] : verification n-grams
-    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
+    llama_batch batch = llama_batch_init(params.kv_size, 0, W + G + 1);

    // target model sampling context
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -47,8 +47,8 @@ int main(int argc, char ** argv){
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

-    const int max_context_size     = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
+    const int max_kv_size          = llama_kv_size(ctx);
+    const int max_tokens_list_size = max_kv_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
@@ -86,7 +86,7 @@ int main(int argc, char ** argv){

    std::vector<llama_token> draft;

-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
+    llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, 1);

    // debug
    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -70,7 +70,8 @@ In this section, we cover the most commonly used options for running the `main`
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
+-   `-kv N`, `--kv-size N`: Set the size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

 ## Input Prompts

@@ -134,15 +135,15 @@ By understanding and utilizing these interaction options, you can create engagin

 During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

-### Context Size
+### KV Context Size

-The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+The `--kv-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.

-   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+-   `-kv N, --kv-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

 ### Extended Context Size

-Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--kv-size` to 32768 (32k) and `--rope-scale` to 8.

 -   `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.

@@ -152,7 +153,7 @@ The `--keep` option allows users to retain the original prompt when the model ru

 -   `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

-By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+By utilizing context management options like `--kv-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.

 ## Generation Flags

@@ -181,12 +182,12 @@ Example usage: `--temp 0.5`
 ### Repeat Penalty

 -   `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
-   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+-   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).
 -   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

 The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.

-The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`kv-size`).

 Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -157,9 +157,9 @@ int main(int argc, char ** argv) {
        return 0;
    }

-    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
+    if (params.kv_size != 0 && params.kv_size < 8) {
+        LOG_TEE("%s: warning: minimum KV size is 8, using minimum size.\n", __func__);
+        params.kv_size = 8;
    }

    if (params.rope_freq_base != 0.0) {
@@ -208,12 +208,12 @@ int main(int argc, char ** argv) {
    }

    const int n_ctx_train = llama_n_ctx_train(model);
-    const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
+    const int kv_size     = llama_kv_size(ctx);
+    LOG("kv_size: %d\n", kv_size);

-    if (n_ctx > n_ctx_train) {
+    if (kv_size > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
+                __func__, n_ctx_train, kv_size);
    }

    // print system information
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
            LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
        } else {
            // The file exists and is not empty
-            session_tokens.resize(n_ctx);
+            session_tokens.resize(kv_size);
            size_t n_token_count_out = 0;
            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@@ -289,8 +289,8 @@ int main(int argc, char ** argv) {
        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
    }

-    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+    if ((int) embd_inp.size() > kv_size - 4) {
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
        return 1;
    }

@@ -334,8 +334,6 @@ int main(int argc, char ** argv) {
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
        params.n_keep = (int)embd_inp.size();
-    } else {
-        params.n_keep += add_bos; // always keep the BOS token
    }

    // prefix & suffix for instruct mode
@@ -385,8 +383,8 @@ int main(int argc, char ** argv) {
            }
        }

-        if (params.n_keep > add_bos) {
-            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+        if (params.n_keep > 0) {
+        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
@@ -452,7 +450,7 @@ int main(int argc, char ** argv) {
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
-    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);

    // group-attention state
    // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -465,7 +463,7 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(ga_n > 0                    && "grp_attn_n must be positive");                     // NOLINT
        GGML_ASSERT(ga_w % ga_n == 0            && "grp_attn_w must be a multiple of grp_attn_n");     // NOLINT
      //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of grp_attn_w");    // NOLINT
-      //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
+      //GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * grp_attn_n"); // NOLINT
        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
    }
    LOG_TEE("\n\n");
@@ -511,22 +509,14 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

-    // tokenized antiprompts
-    std::vector<std::vector<llama_token>> antiprompt_ids;
-
-    antiprompt_ids.reserve(params.antiprompt.size());
-    for (const std::string & antiprompt : params.antiprompt) {
-        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
-    }
-
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {
-            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
+            // Note: (kv_size - 4) here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
-            int max_embd_size = n_ctx - 4;
+            int max_embd_size = kv_size - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
@@ -543,21 +533,21 @@ int main(int argc, char ** argv) {
                // infinite text generation via context shifting
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
-                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+                // - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
+                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
                    if (params.n_predict == -2) {
                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
                    }

-                    const int n_left    = n_past - params.n_keep;
+                    const int n_left    = n_past - params.n_keep - 1;
                    const int n_discard = n_left/2;

-                    LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                            n_past, n_left, n_ctx, params.n_keep, n_discard);
+                    LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
+                        n_past, n_left, kv_size, params.n_keep, n_discard);

-                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                    llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                    llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

                    n_past -= n_discard;

@@ -584,9 +574,9 @@ int main(int argc, char ** argv) {
                    LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

-                    llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
-                    llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
-                    llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
+                    llama_kv_cache_seq_shift(ctx, 0, ga_i,                n_past,              ib*bd);
+                    llama_kv_cache_seq_div  (ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
+                    llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);

                    n_past -= bd;

@@ -676,7 +666,7 @@ int main(int argc, char ** argv) {
                LOG("n_past = %d\n", n_past);
                // Display total tokens alongside total time
                if (params.n_print > 0 && n_past % params.n_print == 0) {
-                    LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+                    LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, kv_size);
                }
            }

@@ -777,18 +767,6 @@ int main(int argc, char ** argv) {
                    }
                }

-                // check for reverse prompt using special tokens
-                llama_token last_token = llama_sampling_last(ctx_sampling);
-                for (std::vector<llama_token> ids : antiprompt_ids) {
-                    if (ids.size() == 1 && last_token == ids[0]) {
-                        if (params.interactive) {
-                            is_interacting = true;
-                        }
-                        is_antiprompt = true;
-                        break;
-                    }
-                }
-
                if (is_antiprompt) {
                    LOG("found antiprompt: %s\n", last_output.c_str());
                }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "\n\n");
    fflush(stderr);

-    const int n_ctx = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);

    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
@@ -169,7 +169,7 @@ int main(int argc, char ** argv) {

    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
-    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
+    llama_batch batch = llama_batch_init(kv_size, 0, 1);

    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed    = seed;
-    ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
+    ctx_params.kv_size = llama_n_ctx_train(model)*n_grp + n_keep;
    ctx_params.n_batch = 512;
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -121,12 +121,12 @@ int main(int argc, char ** argv) {
    // total length of the sequences including the prompt
    const int n_len = n_tokens_all + n_predict;

-    const int n_ctx       = llama_n_ctx(ctx) - n_keep;
-    const int n_kv_req    = llama_n_ctx(ctx);
+    const int kv_size     = llama_kv_size(ctx) - n_keep;
+    const int n_kv_req    = llama_kv_size(ctx);
    const int n_batch     = ctx_params.n_batch;
    const int n_batch_grp = ctx_params.n_batch/n_grp;

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+    LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, kv_size, n_kv_req, n_grp, n_batch);

    // print the prompt token-by-token

@@ -140,17 +140,16 @@ int main(int argc, char ** argv) {
    int n_past = 0;

    // fill the KV cache
-    for (int i = 0; i < n_ctx; i += n_batch) {
+    for (int i = 0; i < kv_size; i += n_batch) {
        if (i > 0 && n_grp > 1) {
            // if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
            const int ib = i/n_batch - 1;
            const int bd = n_batch_grp*(n_grp - 1);

-            llama_kv_cache_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_cache_update  (ctx);
+            llama_kv_cache_seq_shift(ctx, 0, n_past - n_batch,         n_past,         ib*bd);
+            llama_kv_cache_seq_div  (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);

-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past -= bd;
        }

        llama_batch_clear(batch);
@@ -175,17 +174,15 @@ int main(int argc, char ** argv) {
        }
    }

-    for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
+    for (int i = kv_size; i < n_tokens_all; i += n_batch) {
        const int n_discard = n_batch;

        LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);

-        llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
-        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-      //llama_kv_cache_defrag (ctx);
-        llama_kv_cache_update (ctx);
+        llama_kv_cache_seq_rm   (ctx, 0, n_keep            , n_keep + n_discard);
+        llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);

-        n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+        n_past -= n_discard;

        llama_batch_clear(batch);

@@ -206,17 +203,15 @@ int main(int argc, char ** argv) {
    }

    {
-        const int n_discard = n_past - n_ctx + n_predict;
+        const int n_discard = n_past - kv_size + n_predict;

        if (n_discard > 0) {
            LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

-            llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
-            llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-          //llama_kv_cache_defrag (ctx);
-            llama_kv_cache_update (ctx);
+            llama_kv_cache_seq_rm   (ctx, 0, n_keep            , n_keep + n_discard);
+            llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);

-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past -= n_discard;
        }
    }

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -320,11 +320,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

-    const int n_ctx = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);

-    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
-                n_ctx);
+    if (int(tokens.size()) < 2*kv_size) {
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*kv_size,
+                kv_size);
        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }
@@ -340,13 +340,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        return {tokens, -1, logit_history, prob_history};
    }

-    const int calc_chunk = n_ctx;
+    const int calc_chunk = kv_size;

    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

    if (int(tokens.size()) <= calc_chunk) {
-        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
-                tokens.size(), n_ctx, params.ppl_stride);
+        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+                tokens.size(), kv_size, params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }

@@ -414,8 +414,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
-        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
+        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.kv_size - params.ppl_stride + start, params.kv_size + start);
+        for (int j = kv_size - params.ppl_stride - 1; j < kv_size - 1; ++j) {

            // Calculate probability of next token, given the previous ones.
            const std::vector<float> tok_logits(
@@ -453,7 +453,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    // BOS tokens will be added for each chunk before eval

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
+    const int kv_size  = llama_kv_size(ctx);

    std::ofstream logits_stream;
    if (!params.logits_file.empty()) {
@@ -464,7 +464,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        }
        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
        logits_stream.write("_logits_", 8);
-        logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
+        logits_stream.write(reinterpret_cast<const char *>(&kv_size), sizeof(kv_size));
    }

    auto tim1 = std::chrono::high_resolution_clock::now();
@@ -475,9 +475,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

-    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
-                n_ctx);
+    if (int(tokens.size()) < 2*kv_size) {
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*kv_size,
+                kv_size);
        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }
@@ -488,7 +488,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    std::vector<float> prob_history;
    prob_history.resize(tokens.size());

-    const int n_chunk_max = tokens.size() / n_ctx;
+    const int n_chunk_max = tokens.size() / kv_size;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@@ -498,11 +498,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    double nll = 0.0;
    double nll2 = 0.0;

-    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+    const int num_batches = (kv_size + n_batch - 1) / n_batch;

    std::vector<float> logits;
    if (num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
+        logits.reserve((size_t)kv_size * n_vocab);
    }

    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
@@ -513,14 +513,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    if (!params.logits_file.empty()) {
        logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
        logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
-        logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
+        logits_stream.write((const char *)tokens.data(), n_chunk*kv_size*sizeof(tokens[0]));
        const int nv = 2*((n_vocab + 1)/2) + 4;
-        log_probs.resize(n_ctx * nv);
+        log_probs.resize(kv_size * nv);
    }

    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * n_ctx;
-        const int end   = start + n_ctx;
+        const int start =     i * kv_size;
+        const int end   = start + kv_size;

        const auto t_start = std::chrono::high_resolution_clock::now();

@@ -566,7 +566,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        // We get the logits for all the tokens in the context window (params.n_ctx)
+        // We get the logits for all the tokens in the context window (params.kv_size)
        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
        // calculate the perplexity over the last half of the window (so the model always has
        // some context to predict the token).
@@ -578,16 +578,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-        const int first = n_ctx/2;
+        const int first = kv_size/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
        if (!params.logits_file.empty()) {
-            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
                    workers, log_probs, nll, nll2);
        } else {
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        }
-        count += n_ctx - first - 1;
+        count += kv_size - first - 1;

        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
@@ -596,7 +596,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            double av = nll/count;
            double av2 = nll2/count - av*av;
            if (av2 > 0) av2 = sqrt(av2/(count-1));
-            printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+            printf("%8d  %.4lf  %4lf  %4lf\n", i*kv_size, std::exp(nll / count), av, av2);
        }
        fflush(stdout);

@@ -805,16 +805,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    double acc = 0.0f;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_ctx   = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);
    const int n_batch = params.n_batch;

    const int max_tasks_per_batch = 32;
    const int max_seq = 4*max_tasks_per_batch;

-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    llama_batch batch = llama_batch_init(kv_size, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(n_vocab*kv_size);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
@@ -832,7 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        // each task has 4 unique seuqnce ids - one for each ending
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
-        while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
+        while (n_cur + (int) hs_data[i1].required_tokens <= kv_size) {
            auto & hs_cur = hs_data[i1];

            const int s0 = 4*(i1 - i0);
@@ -1082,16 +1082,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_ctx   = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);
    const int n_batch = params.n_batch;

    const int max_tasks_per_batch = 128;
    const int max_seq = 2*max_tasks_per_batch;

-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    llama_batch batch = llama_batch_init(kv_size, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(n_vocab*kv_size);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
@@ -1108,7 +1108,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

        llama_batch_clear(batch);

-        while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
+        while (n_cur + (int) data[i1].required_tokens <= kv_size) {
            const int s0 = 2*(i1 - i0);
            if (s0 + 2 > max_seq) {
                break;
@@ -1434,16 +1434,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    printf("\ntask\tacc_norm\n");

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_ctx   = llama_n_ctx(ctx);
+    const int kv_size = llama_kv_size(ctx);
    const int n_batch = params.n_batch;

    const int max_tasks_per_batch = 32;
    const int max_seq = 4*max_tasks_per_batch;

-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    llama_batch batch = llama_batch_init(kv_size, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
+    std::vector<float> batch_logits(n_vocab*kv_size);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
    std::vector<float> eval_results;
@@ -1467,7 +1467,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        int s0 = 0;
-        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
+        while (n_cur + (int) tasks[i1].required_tokens <= kv_size) {
            auto& cur_task = tasks[i1];

            int num_answers = cur_task.seq_tokens.size();
@@ -1620,11 +1620,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        }
    }

-    uint32_t n_ctx;
-    in.read((char *)&n_ctx, sizeof(n_ctx));
-    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
-                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
+    uint32_t kv_size;
+    in.read((char *)&kv_size, sizeof(kv_size));
+    if (kv_size > llama_kv_size(ctx)) {
+        fprintf(stderr, "%s: %s has been computed with %u, while the current KV Cache size is %d. Increase it with -kv and retry\n",
+                __func__, params.logits_file.c_str(), kv_size, params.kv_size);
    }

    int n_vocab, n_chunk;
@@ -1638,22 +1638,22 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
    }

-    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    std::vector<llama_token> tokens(kv_size * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
    }

    const int n_batch = params.n_batch;
-    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
+    const int num_batches = (kv_size + n_batch - 1)/n_batch;
    const int nv = 2*((n_vocab + 1)/2) + 4;
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

-    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
-    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<uint16_t> log_probs_uint16(size_t(kv_size - 1 - kv_size/2) * nv);
+    std::vector<float> kld_values(size_t(kv_size - 1 - kv_size/2)*n_chunk);
    std::vector<float> logits;
    if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
+        logits.reserve(kv_size * n_vocab);
    }

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -1672,8 +1672,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    auto kld_ptr = kld_values.data();

    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * n_ctx;
-        const int end   = start + n_ctx;
+        const int start =     i * kv_size;
+        const int end   = start + kv_size;

        const auto t_start = std::chrono::high_resolution_clock::now();

@@ -1726,11 +1726,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
        }

-        const int first = n_ctx/2;
+        const int first = kv_size/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
                workers, log_probs_uint16, kld, kld_ptr);
-        kld_ptr += n_ctx - 1 - first;
+        kld_ptr += kv_size - 1 - first;

        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
@@ -1788,12 +1788,12 @@ int main(int argc, char ** argv) {
    }

    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
+    params.n_batch = std::min(params.n_batch, params.kv_size);

    if (params.ppl_stride > 0) {
-        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
-                params.n_ctx, params.n_ctx + params.ppl_stride/2);
-        params.n_ctx += params.ppl_stride/2;
+        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting KV size from %d to %d\n",
+                params.kv_size, params.kv_size + params.ppl_stride/2);
+        params.kv_size += params.ppl_stride/2;
    }

    print_build_info();
@@ -1823,9 +1823,9 @@ int main(int argc, char ** argv) {
    }

    const int n_ctx_train = llama_n_ctx_train(model);
-    if (params.n_ctx > n_ctx_train) {
+    if (params.kv_size > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, params.n_ctx);
+                __func__, n_ctx_train, params.kv_size);
    }

    // print system information
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
        }

        auto cparams = llama_context_default_params();
-        cparams.n_ctx      = 256;
+        cparams.kv_size    = 256;
        cparams.seed       = 1;

        ctx = llama_new_context_with_model(model, cparams);
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -23,21 +23,15 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
-    { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
-    { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
-    { "IQ3_S",  LLAMA_FTYPE_MOSTLY_IQ3_S,  " 3.44 bpw quantization",            },
-    { "IQ3_M",  LLAMA_FTYPE_MOSTLY_IQ3_M,  " 3.66 bpw quantization mix",        },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization"   ,          },
+    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization"   , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
-    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
-    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
@@ -295,7 +289,6 @@ int main(int argc, char ** argv) {
    }

    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
        fprintf(stderr, "\n===============================================================================================\n");
        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
--- a/examples/server-embd.py
+++ b/examples/server-embd.py
@@ -1,34 +0,0 @@
-import asyncio
-import requests
-import numpy as np
-
-n = 8
-
-result = []
-
-async def requests_post_async(*args, **kwargs):
-    return await asyncio.to_thread(requests.post, *args, **kwargs)
-
-async def main():
-    model_url = "http://127.0.0.1:6900"
-    responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
-        url= f"{model_url}/embedding",
-        json= {"content": str(i)*1024}
-    ) for i in range(n)])
-
-    for response in responses:
-        embedding = response.json()["embedding"]
-        print(embedding[-8:])
-        result.append(embedding)
-
-asyncio.run(main())
-
-# compute cosine similarity
-
-for i in range(n-1):
-    for j in range(i+1, n):
-        embedding1 = np.array(result[i])
-        embedding2 = np.array(result[j])
-        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
-        print(f"Similarity between {i} and {j}: {similarity:.2f}")
-
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@@ -12,7 +12,7 @@ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
 N_THREAD="${N_THREAD:-12}"

 # Note: you can also override the generation options by specifying them on the command line:
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
+GEN_OPTIONS="${GEN_OPTIONS:---kv_size 4096 --batch-size 1024}"


 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -1,27 +1,15 @@
-# LLaMA.cpp HTTP Server
+# llama.cpp/example/server

-Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/yhirose/cpp-httplib), [nlohmann::json](https://github.com/nlohmann/json) and **llama.cpp**.
+This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.

-Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
-
-**Features:**
- * LLM inference of F16 and quantum models on GPU and CPU
- * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
- * Parallel decoding with multi-user support
- * Continuous batching
- * Multimodal (wip)
- * Monitoring endpoints
-
-The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
-
-**Command line options:**
+Command line options:

 - `--threads N`, `-t N`: Set the number of threads to use during generation.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
+- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
+- `-kv N`, `--kv-size N`: Specify the total size of the KV cache. This corresponds to the total number of tokens that can be stored across all independent sequences / slots. `llama.cpp` implements a "unified" cache strategy: the KV cache is shared across all sequences. With `P` parallel sequences (`--parallel P`) and a nominal budget of `T` tokens per sequence, a sequence is allowed to hold more than `T` tokens as long as the sum of all tokens does not exceed `P*T`. The default is 512.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
@@ -46,18 +34,14 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
 - `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
+- `-np N`, `--parallel N`: Set the number of slots / sequences used to process requests (default: 1). Each sequence can have a maximum of `T` tokens; use together with `--kv-size`.
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
+- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)

 ## Build

@@ -114,12 +98,6 @@ curl --request POST \
    --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
 ```

-## Advanced testing
-
-We implemented a [server test framework](./tests/README.md) using human-readable scenario.
-
-*Before submitting an issue, please try to reproduce it with this format.*
-
 ## Node JS Test

 You need to have [Node.js](https://nodejs.org/en) installed.
@@ -157,13 +135,10 @@ node index.js
 ## API Endpoints

 - **GET** `/health`: Returns the current state of the server:
-  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
-  - 500 -> `{"status": "error"}` if the model failed to load.
-  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
-  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
-  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
-
-  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
+  - `{"status": "loading model"}` if the model is still being loaded.
+  - `{"status": "error"}` if the model failed to load.
+  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.

 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

@@ -173,7 +148,7 @@ node index.js

    `temperature`: Adjust the randomness of the generated text (default: 0.8).

-    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).
+    `dynatemp_range`: Dynamic temperature range (default: 0.0, 0.0 = disabled).

    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).

@@ -199,7 +174,7 @@ node index.js

    `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).

-    `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
+    `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).

    `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).

@@ -231,7 +206,7 @@ node index.js

    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

-    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)
+    `cache_prompt`: Save the prompt and generation to avoid reprocessing the entire prompt when only part of it has changed (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

@@ -264,7 +239,7 @@ Notice that each `probs` is an array of length `n_probs`.

 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
 - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
+- `generation_settings`: The provided options above excluding `prompt` but including `kv_size`, `model`
 - `model`: The path to the model loaded with `-m`
 - `prompt`: The provided `prompt`
 - `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
@@ -274,7 +249,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
 - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
 - `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
+- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the KV size (`kv_size`)

 - **POST** `/tokenize`: Tokenize a given text.

@@ -326,7 +301,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)

- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc., can be used with this endpoint. Compared to `api_like_OAI.py`, this API implementation does not require a wrapper to be served.

    *Options:*

@@ -429,7 +404,7 @@ Notice that each `probs` is an array of length `n_probs`.
        "mirostat_eta": 0.10000000149011612,
        "mirostat_tau": 5.0,
        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
-        "n_ctx": 2048,
+        "kv_size": 2048,
        "n_keep": 0,
        "n_predict": 100000,
        "n_probs": 0,
@@ -473,18 +448,6 @@ Notice that each `probs` is an array of length `n_probs`.
 ]
 ```

- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
-
-Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
- `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
- `llamacpp:kv_cache_tokens`: KV-cache tokens.
- `llamacpp:requests_processing`: Number of request processing.
- `llamacpp:requests_deferred`: Number of request deferred.
-
 ## More examples

 ### Change system prompt on runtime
@@ -528,7 +491,20 @@ bash chat.sh

 ### API like OAI

-The HTTP server supports OAI-like API
+API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
+This example must be used with server.cpp
+
+```sh
+python api_like_OAI.py
+```
+
+After running the API server, you can use it in Python by setting the API base URL.
+
+```python
+openai.api_base = "http://<Your api-server IP>:port"
+```
+
+Then you can use llama.cpp as an OpenAI-style **chat.completion** or **text_completion** API.

 ### Extending or building alternative Web Front End

--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+import argparse
+from flask import Flask, jsonify, request, Response
+import urllib.parse
+import requests
+import time
+import json
+
+
+app = Flask(__name__)
+slot_id = -1  # sent as "slot_id" with every /completion request built by make_postData
+
+parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
+parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
+parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
+parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
+parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
+parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
+parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
+parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
+parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1')
+parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081)
+
+args = parser.parse_args()  # parsed when the module is loaded; read as a module-level global by the functions below
+
+def is_present(json, key):
+    """Return True when `key` exists in the mapping `json` and its value is not None."""
+    try:
+        value = json[key]
+    except KeyError:
+        return False
+    # Identity test: `== None` can be fooled by objects that override __eq__.
+    return value is not None
+
+# Flatten an OpenAI-style `messages` list into one prompt string for the server.
+def convert_chat(messages):
+    """Return args.chat_prompt followed by each system/user/assistant message, ending with the assistant prefix."""
+    system_n = args.system_name
+    user_n = args.user_name
+    ai_n = args.ai_name
+    stop = args.stop
+
+    prompt = "" + args.chat_prompt + stop
+
+    for line in messages:
+        if (line["role"] == "system"):
+            prompt += f"{system_n}{line['content']}{stop}"
+        if (line["role"] == "user"):
+            prompt += f"{user_n}{line['content']}{stop}"
+        if (line["role"] == "assistant"):  # messages with any other role add nothing to the prompt
+            prompt += f"{ai_n}{line['content']}{stop}"
+    prompt += ai_n.rstrip()  # end on the assistant prefix, without its trailing whitespace
+
+    return prompt
+
+def make_postData(body, chat=False, stream=False):
+    """Translate an OpenAI-style request `body` into the JSON payload sent to the server's /completion."""
+    postData = {}
+    postData["prompt"] = convert_chat(body["messages"]) if chat else body["prompt"]
+    if(is_present(body, "temperature")): postData["temperature"] = body["temperature"]
+    if(is_present(body, "top_k")): postData["top_k"] = body["top_k"]
+    if(is_present(body, "top_p")): postData["top_p"] = body["top_p"]
+    if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"]
+    if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"]
+    if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"]
+    if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"]
+    if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"]
+    if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
+    if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
+    if(is_present(body, "seed")): postData["seed"] = body["seed"]
+    if(is_present(body, "grammar")): postData["grammar"] = body["grammar"]
+    if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
+    # Always stop on the configured stop word (if any), then add the request's own stop sequences.
+    postData["stop"] = [args.stop] if args.stop != "" else []
+    if(is_present(body, "stop")):
+        # `stop` may be a single string or a list; `list += str` would append it character by character.
+        extra_stop = body["stop"]
+        postData["stop"] += [extra_stop] if isinstance(extra_stop, str) else list(extra_stop)
+    postData["n_keep"] = -1
+    postData["stream"] = stream
+    postData["cache_prompt"] = True
+    # Module-level slot_id; it is initialised to -1 at the top of this file.
+    postData["slot_id"] = slot_id
+    return postData
+
+def make_resData(data, chat=False, promptToken=[]):  # build the non-streaming OpenAI-style response from a /completion result
+    resData = {
+        "id": "chatcmpl" if (chat) else "cmpl",
+        "object": "chat.completion" if (chat) else "text_completion",
+        "created": int(time.time()),
+        "truncated": data["truncated"],
+        "model": "LLaMA_CPP",
+        "usage": {
+            "prompt_tokens": data["tokens_evaluated"],
+            "completion_tokens": data["tokens_predicted"],
+            "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"]
+        }
+    }
+    if (len(promptToken) != 0):  # echo the prompt tokens only when some were supplied
+        resData["promptToken"] = promptToken
+    if (chat):
+        # chat shape: the generated text goes under "message"; only one choice is supported
+        resData["choices"] = [{
+            "index": 0,
+            "message": {
+                "role": "assistant",
+                "content": data["content"],
+            },
+            "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"  # neither EOS nor stop word -> "length"
+        }]
+    else:
+        # text-completion shape: the generated text goes under "text"; only one choice is supported
+        resData["choices"] = [{
+            "text": data["content"],
+            "index": 0,
+            "logprobs": None,
+            "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"  # neither EOS nor stop word -> "length"
+        }]
+    return resData
+
+def make_resData_stream(data, chat=False, time_now = 0, start=False):  # build one OpenAI-style streaming chunk from a server event
+    resData = {
+        "id": "chatcmpl" if (chat) else "cmpl",
+        "object": "chat.completion.chunk" if (chat) else "text_completion.chunk",
+        "created": time_now,
+        "model": "LLaMA_CPP",
+        "choices": [
+            {
+                "finish_reason": None,
+                "index": 0
+            }
+        ]
+    }
+    slot_id = data.get("slot_id")  # NOTE(review): no `global` statement, so this binds a local and the module-level slot_id is unchanged - confirm intent
+    if (chat):
+        if (start):  # opening chunk: announces the role only and does not read data["content"]
+            resData["choices"][0]["delta"] =  {
+                "role": "assistant"
+            }
+        else:
+            resData["choices"][0]["delta"] =  {
+                "content": data["content"]
+            }
+            if (data["stop"]):  # last chunk: replace the None finish_reason with the actual reason
+                resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
+    else:
+        resData["choices"][0]["text"] = data["content"]
+        if (data["stop"]):  # last chunk: replace the None finish_reason with the actual reason
+            resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
+
+    return resData
+
+
+@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
+def chat_completions():
+    """OpenAI-style chat endpoint: converts `messages` to a prompt and proxies it to the server's /completion."""
+    # Answer the CORS preflight before authenticating: preflight requests carry no Authorization header.
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
+    # Expect "Authorization: <scheme> <key>"; a missing or malformed header is rejected with 403 instead of raising.
+    if (args.api_key != "" and request.headers.get("Authorization", "").split()[1:2] != [args.api_key]):
+        return Response(status=403)
+    body = request.get_json()
+    stream = body["stream"] if is_present(body, "stream") else False
+    tokenize = body["tokenize"] if is_present(body, "tokenize") else False
+    postData = make_postData(body, chat=True, stream=stream)
+    promptToken = []
+    if (tokenize):
+        tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
+        promptToken = tokenData["tokens"]
+
+    if (not stream):
+        data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
+        print(data.json())
+        resData = make_resData(data.json(), chat=True, promptToken=promptToken)
+        return jsonify(resData)
+    else:
+        def generate():
+            data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
+            time_now = int(time.time())
+            resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
+            yield 'data: {}\n\n'.format(json.dumps(resData))
+            for line in data.iter_lines():
+                if line:
+                    decoded_line = line.decode('utf-8')
+                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
+                    yield 'data: {}\n\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
+
+
+@app.route('/completions', methods=['POST', 'OPTIONS'])
+@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
+def completion():
+    """OpenAI-style text-completion endpoint: proxies the request's `prompt` to the server's /completion."""
+    # Answer the CORS preflight before authenticating: preflight requests carry no Authorization header.
+    if request.method == 'OPTIONS':
+        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
+    # Expect "Authorization: <scheme> <key>"; a missing or malformed header is rejected with 403 instead of raising.
+    if (args.api_key != "" and request.headers.get("Authorization", "").split()[1:2] != [args.api_key]):
+        return Response(status=403)
+    body = request.get_json()
+    stream = body["stream"] if is_present(body, "stream") else False
+    tokenize = body["tokenize"] if is_present(body, "tokenize") else False
+    postData = make_postData(body, chat=False, stream=stream)
+    promptToken = []
+    if (tokenize):
+        tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
+        promptToken = tokenData["tokens"]
+
+    if (not stream):
+        data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
+        print(data.json())
+        resData = make_resData(data.json(), chat=False, promptToken=promptToken)
+        return jsonify(resData)
+    else:
+        def generate():
+            data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
+            time_now = int(time.time())
+            for line in data.iter_lines():
+                if line:
+                    decoded_line = line.decode('utf-8')
+                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
+                    yield 'data: {}\n\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
+
+if __name__ == '__main__':
+    app.run(args.host, port=args.port)
--- a/examples/server/oai.hpp
+++ b/examples/server/oai.hpp
@@ -15,11 +15,13 @@
 using json = nlohmann::json;

 inline static json oaicompat_completion_params_parse(
-    const struct llama_model * model,
    const json &body, /* openai api json semantics */
    const std::string &chat_template)
 {
    json llama_params;
+    std::string formatted_prompt = chat_template == "chatml"
+        ? format_chatml(body["messages"])  // OpenAI 'messages' to chatml (with <|im_start|>,...)
+        : format_llama2(body["messages"]); // OpenAI 'messages' to llama2 (with [INST],...)

    llama_params["__oaicompat"] = true;

@@ -32,7 +34,7 @@ inline static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["prompt"]            = format_chat(model, chat_template, body["messages"]);
+    llama_params["prompt"]            = formatted_prompt;
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,67 +0,0 @@
-# Server tests
-
-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
-and [behave](https://behave.readthedocs.io/en/latest/):
-
-* [issues.feature](./features/issues.feature) Pending issues scenario
-* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
-* [security.feature](./features/security.feature) Security, CORS and API Key
-* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
-
-Tests target GitHub workflows job runners with 4 vCPU.
-
-Requests are
-using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
-based http client.
-
-Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
-To mitigate it, you can increase values in `n_predict`, `kv_size`.
-
-### Install dependencies
-
-`pip install -r requirements.txt`
-
-### Run tests
-
-1. Build the server
-
-```shell
-cd ../../..
-mkdir build
-cd build
-cmake ../
-cmake --build . --target server
-```
-
-2. Start the test: `./tests.sh`
-
-It's possible to override some scenario steps values with environment variables:
-
-| variable                 | description                                                                                    |
-|--------------------------|------------------------------------------------------------------------------------------------|
-| `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
-| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/server`                         |
-| `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                       |
-| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format                                                       |
-| `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                |
-
-### Run @bug, @wip or @wrong_usage annotated scenario
-
-Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.
-
- `@bug` annotation aims to link a scenario with a GitHub issue.
- `@wrong_usage` are meant to show user issue that are actually an expected behavior
- `@wip` to focus on a scenario working in progress
- `@slow` heavy test, disabled by default
-
-To run a scenario annotated with `@bug`, start:
-
-```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug
-```
-
-After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
-
-```shell
-./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
-```
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -1,72 +0,0 @@
-import os
-import socket
-import subprocess
-import time
-from contextlib import closing
-from signal import SIGKILL
-
-
-def before_scenario(context, scenario):
-    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
-    if context.debug:
-        print("DEBUG=ON\n")
-    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
-    port = 8080
-    if 'PORT' in os.environ:
-        port = int(os.environ['PORT'])
-    if is_server_listening("localhost", port):
-        assert False, "Server already started"
-
-
-def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
-
-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
-
-    print(f"stopping server pid={context.server_process.pid} ...")
-    context.server_process.kill()
-    # Wait few for socket to free up
-    time.sleep(0.05)
-
-    attempts = 0
-    while is_server_listening(context.server_fqdn, context.server_port):
-        print(f"stopping server pid={context.server_process.pid} ...")
-        os.kill(context.server_process.pid, SIGKILL)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            print(f"Server dangling exits, killing all {context.server_path} ...")
-            process = subprocess.run(['killall', '-9', context.server_path],
-                                     stderr=subprocess.PIPE,
-                                     universal_newlines=True)
-            print(process)
-
-
-def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        return result == 0
-
-
-def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    import errno
-    if pid < 0:
-        return False
-    try:
-        os.kill(pid, 0)
-    except OSError as e:
-        return e.errno == errno.EPERM
-    else:
-        return True
--- a/examples/server/tests/features/issues.feature
+++ b/examples/server/tests/features/issues.feature
@@ -1,5 +0,0 @@
-# List of ongoing issues
-# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
-@bug
-Feature: Issues
-  # No confirmed issue at the moment
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -1,146 +0,0 @@
-@llama.cpp
-@parallel
-Feature: Parallel
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   42 as server seed
-    And   512 as batch size
-    And   64 KV cache size
-    And   2 slots
-    And   embeddings extraction
-    And   continuous batching
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Multi users completion
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And <n_predict> max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    And  all slots are idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | n_predict |
-      | 128       |
-
-  Scenario Outline: Multi users OAI completions compatibility
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario Outline: Multi users OAI completions compatibility no v1
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests no v1
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
-
-  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    And 128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted
-
-  Scenario: Multi users embeddings
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
-
-  Scenario: Multi users OAI compatibility embeddings
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    And a prompt:
-      """
-      What is the biggest US city ?
-      """
-    And a prompt:
-      """
-      What is the capital of Bulgaria ?
-      """
-    And   a model tinyllama-2
-    Given concurrent OAI embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -1,55 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags passkey
-@passkey
-@slow
-Feature: Passkey / Self-extend with context shift
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-
-  # Generates a long text of junk and inserts a secret passkey number inside it.
-  # Then we query the LLM for the secret passkey.
-  # see #3856 and #4810
-  Scenario Outline: Passkey
-    Given a model file <hf_file> from HF repo <hf_repo>
-    And   <n_batch> as batch size
-    And   <n_junk> as number of junk
-    And   <n_predicted> server max tokens to predict
-    And   42 as seed
-    And   <n_ctx> KV cache size
-    And   1 slots
-    And   <n_ga> group attention factor to extend context size through self-extend
-    And   <n_ga_w> group attention width to extend context size through self-extend
-    # Can be override with N_GPU_LAYERS
-    And   <ngl> GPU offloaded layers
-    Then  the server is starting
-    Then  the server is healthy
-    Given available models
-    Then  model 0 is trained on <n_ctx_train> tokens context
-    Given a prefix prompt:
-    """
-    here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
-    """
-    And a passkey prompt template:
-    """
-    The pass key is <passkey> Remember it. <passkey> is the pass key.
-    """
-    And a junk suffix prompt:
-    """
-    The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
-    """
-    And a suffix prompt:
-    """
-    What is the pass key? The pass key is
-    """
-    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
-    And  a completion request with no api error
-    Then <n_predicted> tokens are predicted matching <re_content>
-
-    Examples:
-      | hf_repo                         | hf_file                     | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content     |
-      | TheBloke/phi-2-GGUF             | phi-2.Q4_K_M.gguf           | 2048        | 5   | 8192  | 512     | 4    | 512    | 250    | 50    | 42      | 1           | 42             |
-      | TheBloke/phi-2-GGUF             | phi-2.Q4_K_M.gguf           | 2048        | 5   | 8192  | 512     | 2    | 512    | 250    | 50    | 42      | 1           | \b((?!42)\w)+\b  |
-      #| TheBloke/Llama-2-7B-GGUF        | llama-2-7b.Q2_K.gguf        | 4096        | 3   | 16384 | 512     | 4    | 512    | 500    | 300   | 1234    | 5           | 1234           |
-      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768       | 2   | 16384 | 512     | 4    | 512    | 500    | 100   | 0987    | 5           | 0
-      # 987           |
-
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -1,51 +0,0 @@
-@llama.cpp
-@security
-Feature: Security
-
-  Background: Server startup with an api key defined
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a server api key llama.cpp
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario Outline: Completion with some user api key
-    Given a prompt test
-    And   a user api key <api_key>
-    And   4 max tokens to predict
-    And   a completion request with <api_error> api error
-
-    Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackeme   | raised    |
-      |           | raised    |
-
-  Scenario Outline: OAI Compatibility
-    Given a system prompt test
-    And   a user prompt test
-    And   a model test
-    And   2 max tokens to predict
-    And   streaming is disabled
-    And   a user api key <api_key>
-    Given an OAI compatible chat completions request with <api_error> api error
-
-    Examples: Prompts
-      | api_key   | api_error |
-      | llama.cpp | no        |
-      | llama.cpp | no        |
-      | hackme    | raised    |
-
-
-  Scenario Outline: CORS Options
-    When an OPTIONS request is sent from <origin>
-    Then CORS header <cors_header> is set to <cors_header_value>
-
-    Examples: Headers
-      | origin          | cors_header                      | cors_header_value |
-      | localhost       | Access-Control-Allow-Origin      | localhost         |
-      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
-      | origin          | Access-Control-Allow-Credentials | true              |
-      | web.mydomain.fr | Access-Control-Allow-Methods     | POST              |
-      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -1,91 +0,0 @@
-@llama.cpp
-@server
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model alias tinyllama-2
-    And   42 as server seed
-      # KV Cache corresponds to the total amount of tokens
-      # that can be stored across all independent sequences: #4130
-      # see --ctx-size and #5568
-    And   32 KV cache size
-    And   512 as batch size
-    And   1 slots
-    And   embeddings extraction
-    And   32 server max tokens to predict
-    And   prometheus compatible metrics exposed
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Health
-    Then the server is ready
-    And  all slots are idle
-
-  Scenario Outline: Completion
-    Given a prompt <prompt>
-    And   <n_predict> max tokens to predict
-    And   a completion request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-    And   prometheus metrics are exposed
-
-    Examples: Prompts
-      | prompt                           | n_predict | re_content                       | n_predicted |
-      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
-      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |
-
-  Scenario Outline: OAI Compatibility
-    Given a model <model>
-    And   a system prompt <system_prompt>
-    And   a user prompt <user_prompt>
-    And   <max_tokens> max tokens to predict
-    And   streaming is <enable_streaming>
-    Given an OAI compatible chat completions request with no api error
-    Then  <n_predicted> tokens are predicted matching <re_content>
-
-    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+           | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+ | 32          | enabled          |
-
-  Scenario: Embedding
-    When embeddings are computed for:
-    """
-    What is the capital of Bulgaria ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility
-    Given a model tinyllama-2
-    When an OAI compatible embeddings computation request for:
-    """
-    What is the capital of Spain ?
-    """
-    Then embeddings are generated
-
-  Scenario: OAI Embeddings compatibility with multiple inputs
-    Given a model tinyllama-2
-    Given a prompt:
-      """
-      In which country Paris is located ?
-      """
-    And a prompt:
-      """
-      Is Madrid the capital of Spain ?
-      """
-    When an OAI compatible embeddings computation request for multiple inputs
-    Then embeddings are generated
-
-  Scenario: Tokenize / Detokenize
-    When tokenizing:
-    """
-    What is the capital of France ?
-    """
-    Then tokens can be detokenize
-
-  Scenario: Models available
-    Given available models
-    Then  1 models are supported
-    Then  model 0 is identified by tinyllama-2
-    Then  model 0 is trained on 128 tokens context
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,970 +0,0 @@
-import asyncio
-import collections
-import json
-import os
-import re
-import socket
-import subprocess
-import time
-from contextlib import closing
-from re import RegexFlag
-
-import aiohttp
-import openai
-from behave import step
-from behave.api.async_step import async_run_until_complete
-from huggingface_hub import hf_hub_download
-from prometheus_client import parser
-
-
-@step(u"a server listening on {server_fqdn}:{server_port}")
-def step_server_config(context, server_fqdn, server_port):
-    context.server_fqdn = server_fqdn
-    context.server_port = int(server_port)
-    if 'PORT' in os.environ:
-        context.server_port = int(os.environ['PORT'])
-        print(f"$PORT set, overriding server port with to {context.server_port}")
-
-    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
-
-    context.model_alias = None
-    context.n_batch = None
-    context.n_ctx = None
-    context.n_ga = None
-    context.n_ga_w = None
-    context.n_gpu_layer = None
-    context.n_predict = None
-    context.n_server_predict = None
-    context.n_slots = None
-    context.prompt_prefix = None
-    context.prompt_suffix = None
-    context.server_api_key = None
-    context.server_continuous_batching = False
-    context.server_embeddings = False
-    context.server_metrics = False
-    context.server_process = None
-    context.seed = None
-    context.server_seed = None
-    context.user_api_key = None
-
-    context.tasks_result = []
-    context.concurrent_tasks = []
-    context.prompts = []
-
-
-@step(u'a model file {hf_file} from HF repo {hf_repo}')
-def step_download_hf_model(context, hf_file, hf_repo):
-    context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
-    if context.debug:
-        print(f"model file: {context.model_file}\n")
-
-
-@step(u'a model alias {model_alias}')
-def step_model_alias(context, model_alias):
-    context.model_alias = model_alias
-
-
-@step(u'{seed:d} as server seed')
-def step_seed(context, seed):
-    context.server_seed = seed
-
-
-@step(u'{ngl:d} GPU offloaded layers')
-def step_n_gpu_layer(context, ngl):
-    if 'N_GPU_LAYERS' in os.environ:
-        new_ngl = int(os.environ['N_GPU_LAYERS'])
-        if context.debug:
-            print(f"-ngl upgraded from {ngl} to {new_ngl}")
-        ngl = new_ngl
-    context.n_gpu_layer = ngl
-
-
-@step(u'{n_ctx:d} KV cache size')
-def step_n_ctx(context, n_ctx):
-    context.n_ctx = n_ctx
-
-
-@step(u'{n_slots:d} slots')
-def step_n_slots(context, n_slots):
-    context.n_slots = n_slots
-
-
-@step(u'{n_predict:d} server max tokens to predict')
-def step_server_n_predict(context, n_predict):
-    context.n_server_predict = n_predict
-
-
-@step(u'continuous batching')
-def step_server_continuous_batching(context):
-    context.server_continuous_batching = True
-
-
-@step(u'embeddings extraction')
-def step_server_embeddings(context):
-    context.server_embeddings = True
-
-
-@step(u'prometheus compatible metrics exposed')
-def step_server_metrics(context):
-    context.server_metrics = True
-
-
-@step(u"the server is starting")
-def step_start_server(context):
-    start_server_background(context)
-    attempts = 0
-    while True:
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-            result = sock.connect_ex((context.server_fqdn, context.server_port))
-            if result == 0:
-                print("\x1b[33;46mserver started!\x1b[0m")
-                return
-            attempts += 1
-            if attempts > 20:
-                assert False, "server not started"
-            print(f"waiting for server to start, connect error code = {result}...")
-            time.sleep(0.1)
-
-
-@step(u"the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_the_server_to_be_started(context, expecting_status):
-    match expecting_status:
-        case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
-
-        case 'ready' | 'idle':
-            await wait_for_health_status(context, context.base_url, 200, 'ok',
-                                         timeout=10,
-                                         params={'fail_on_no_slot': 0, 'include_slots': 0},
-                                         slots_idle=context.n_slots,
-                                         slots_processing=0,
-                                         expected_slots=[{'id': slot_id, 'state': 0}
-                                                         for slot_id in
-                                                         range(context.n_slots if context.n_slots else 1)])
-        case 'busy':
-            await wait_for_health_status(context, context.base_url, 503,
-                                         'no slot available',
-                                         params={'fail_on_no_slot': 0, 'include_slots': 0},
-                                         slots_idle=0,
-                                         slots_processing=context.n_slots,
-                                         expected_slots=[{'id': slot_id, 'state': 1}
-                                                         for slot_id in
-                                                         range(context.n_slots if context.n_slots else 1)])
-        case _:
-            assert False, "unknown status"
-
-
-@step(u'all slots are {expected_slot_status_string}')
-@async_run_until_complete
-async def step_all_slots_status(context, expected_slot_status_string):
-    match expected_slot_status_string:
-        case 'idle':
-            expected_slot_status = 0
-        case 'busy':
-            expected_slot_status = 1
-        case _:
-            assert False, "unknown status"
-
-    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
-                      for slot_id in range(context.n_slots)]
-    await request_slots_status(context, expected_slots)
-
-
-@step(u'a completion request with {api_error} api error')
-@async_run_until_complete
-async def step_request_completion(context, api_error):
-    expect_api_error = api_error == 'raised'
-    completion = await request_completion(context.prompts.pop(),
-                                          context.base_url,
-                                          debug=context.debug,
-                                          n_predict=context.n_predict,
-                                          seed=await completions_seed(context),
-                                          expect_api_error=expect_api_error,
-                                          user_api_key=context.user_api_key)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}\n")
-    if expect_api_error:
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-
-
-@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
-def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
-
-
-@step(u'{predicted_n:d} tokens are predicted')
-def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
-
-
-@step(u'a user prompt {user_prompt}')
-def step_user_prompt(context, user_prompt):
-    context.prompts.append(user_prompt)
-
-
-@step(u'a system prompt {system_prompt}')
-def step_system_prompt(context, system_prompt):
-    context.system_prompt = system_prompt
-
-
-@step(u'a model {model}')
-def step_model(context, model):
-    context.model = model
-
-
-@step(u'{max_tokens:d} max tokens to predict')
-def step_max_tokens(context, max_tokens):
-    context.n_predict = max_tokens
-
-
-@step(u'streaming is {enable_streaming}')
-def step_streaming(context, enable_streaming):
-    context.enable_streaming = enable_streaming == 'enabled'
-
-
-@step(u'a user api key {user_api_key}')
-def step_user_api_key(context, user_api_key):
-    context.user_api_key = user_api_key
-
-
-@step(u'no user api key')
-def step_no_user_api_key(context):
-    context.user_api_key = None
-
-
-@step(u'a user api key ')
-def step_no_user_api_key_space(context):
-    context.user_api_key = None
-
-
-@step(u'a server api key {server_api_key}')
-def step_server_api_key(context, server_api_key):
-    context.server_api_key = server_api_key
-
-
-@step(u'{n_junk:d} as number of junk')
-def step_n_junk(context, n_junk):
-    context.n_junk = n_junk
-
-
-@step(u'{n_batch:d} as batch size')
-def step_n_batch(context, n_batch):
-    context.n_batch = n_batch
-
-
-@step(u'{seed:d} as seed')
-def step_seed(context, seed):
-    context.seed = seed
-
-
-@step(u'a prefix prompt')
-def step_prompt_prefix(context):
-    context.prompt_prefix = context.text
-
-
-@step(u'a junk suffix prompt')
-def step_prompt_junk_suffix(context):
-    context.prompt_junk_suffix = context.text
-
-
-@step(u'a suffix prompt')
-def step_prompt_suffix(context):
-    context.prompt_suffix = context.text
-
-
-@step(u'{n_ga:d} group attention factor'
-      u' to extend context size through self-extend')
-def step_impl(context, n_ga):
-    context.n_ga = n_ga
-
-
-@step(u'{n_ga_w:d} group attention width to extend context size through self-extend')
-def step_impl(context, n_ga_w):
-    context.n_ga_w = n_ga_w
-
-
-@step(u'a passkey prompt template')
-def step_prompt_passkey(context):
-    context.prompt_passkey = context.text
-
-
-@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
-def step_prompt_passkey(context, passkey, i_pos):
-    prompt = ""
-    for i in range(context.n_junk):
-        if i % context.n_junk == i_pos:
-            prompt += context.prompt_passkey # the passkey is already substituted
-        prompt += context.prompt_junk_suffix
-    if context.debug:
-        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
-        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
-    context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
-
-
-@step(u'an OAI compatible chat completions request with {api_error} api error')
-@async_run_until_complete
-async def step_oai_chat_completions(context, api_error):
-    if context.debug:
-        print(f"Submitting OAI compatible completions request...\n")
-    expect_api_error = api_error == 'raised'
-    completion = await oai_chat_completions(context.prompts.pop(),
-                                            context.system_prompt,
-                                            context.base_url,
-                                            '/v1/chat',
-                                            False,
-                                            model=context.model if hasattr(context, 'model') else None,
-
-                                            n_predict=context.n_predict
-                                            if hasattr(context, 'n_predict') else None,
-
-                                            enable_streaming=context.enable_streaming
-                                            if hasattr(context, 'enable_streaming') else None,
-
-                                            seed=await completions_seed(context),
-
-                                            user_api_key=context.user_api_key
-                                            if hasattr(context, 'user_api_key') else None,
-
-                                            expect_api_error=expect_api_error)
-    context.tasks_result.append(completion)
-    if context.debug:
-        print(f"Completion response: {completion}")
-    if expect_api_error:
-        assert completion == 401, f"completion must be an 401 status code: {completion}"
-
-    if context.debug:
-        print(f"Completion response: {completion}")
-
-
-@step(u'a prompt')
-def step_a_prompt(context):
-    context.prompts.append(context.text)
-
-
-@step(u'a prompt {prompt}')
-def step_a_prompt_prompt(context, prompt):
-    context.prompts.append(prompt)
-
-
-@step(u'concurrent completion requests')
-@async_run_until_complete()
-async def step_concurrent_completion_requests(context):
-    await concurrent_requests(context,
-                              request_completion,
-                              # prompt is inserted automatically
-                              context.base_url,
-                              debug=context.debug,
-                              prompt_prefix=context.prompt_prefix,
-                              prompt_suffix=context.prompt_suffix,
-                              n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
-                              seed=await completions_seed(context),
-                              user_api_key=context.user_api_key if hasattr(context,
-                                                                           'user_api_key') else None)
-
-
-@step(u'concurrent OAI completions requests')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/v1/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              seed=await completions_seed(context),
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step(u'concurrent OAI completions requests no v1')
-@async_run_until_complete
-async def step_oai_chat_completions(context):
-    await concurrent_requests(context, oai_chat_completions,
-                              # user_prompt is inserted automatically
-                              context.system_prompt,
-                              context.base_url,
-                              '/chat/completions',
-                              True,  # async_client
-                              model=context.model
-                              if hasattr(context, 'model') else None,
-                              n_predict=context.n_predict
-                              if hasattr(context, 'n_predict') else None,
-                              enable_streaming=context.enable_streaming
-                              if hasattr(context, 'enable_streaming') else None,
-                              seed=context.seed
-                              if hasattr(context, 'seed') else
-                              context.server_seed
-                              if hasattr(context, 'server_seed') else None,
-                              user_api_key=context.user_api_key
-                              if hasattr(context, 'user_api_key') else None)
-
-
-@step(u'all prompts are predicted')
-@async_run_until_complete
-async def step_all_prompts_are_predicted(context):
-    await all_prompts_are_predicted(context)
-
-
-@step(u'all prompts are predicted with {n_expected_predicted:d} tokens')
-@async_run_until_complete
-async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
-    await all_prompts_are_predicted(context, n_expected_predicted)
-
-
-async def all_prompts_are_predicted(context, expected_predicted_n=None):
-    n_completions = await gather_tasks_results(context)
-    assert n_completions > 0
-    for i in range(n_completions):
-        assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n)
-    assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
-
-
-@step(u'embeddings are computed for')
-@async_run_until_complete
-async def step_compute_embedding(context):
-    context.embeddings = await request_embedding(context.text, base_url=context.base_url)
-
-
-@step(u'embeddings are generated')
-def step_assert_embeddings(context):
-    if len(context.prompts) == 0:
-        assert_embeddings(context.embeddings)
-    else:
-        assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
-                                                                 f"context.prompts={context.prompts}\n"
-                                                                 f"context.embeddings={context.embeddings}")
-        for embedding in context.embeddings:
-            context.prompts.pop()
-            assert_embeddings(embedding)
-
-
-@step(u'an OAI compatible embeddings computation request for')
-@async_run_until_complete
-async def step_oai_compute_embeddings(context):
-    context.embeddings = await request_oai_embeddings(context.text,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-
-
-@step(u'an OAI compatible embeddings computation request for multiple inputs')
-@async_run_until_complete
-async def step_oai_compute_embeddings_multiple_inputs(context):
-    context.embeddings = await request_oai_embeddings(context.prompts,
-                                                      base_url=context.base_url,
-                                                      user_api_key=context.user_api_key,
-                                                      model=context.model)
-
-
-@step(u'concurrent embedding requests')
-@async_run_until_complete()
-async def step_concurrent_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_embedding,
-                              # prompt is inserted automatically
-                              base_url=context.base_url)
-
-
-@step(u'concurrent OAI embedding requests')
-@async_run_until_complete()
-async def step_concurrent_oai_embedding_requests(context):
-    await concurrent_requests(context,
-                              request_oai_embeddings,
-                              # prompt is inserted automatically
-                              base_url=context.base_url,
-                              async_client=True,
-                              model=context.model)
-
-
-@step(u'all embeddings are generated')
-@async_run_until_complete()
-async def all_embeddings_are_generated(context):
-    n_embedding_requests = await gather_tasks_results(context)
-    assert n_embedding_requests > 0
-    for i in range(n_embedding_requests):
-        assert_embeddings(context.tasks_result.pop())
-
-
-@step(u'tokenizing')
-@async_run_until_complete
-async def step_tokenize(context):
-    context.tokenized_text = context.text
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{context.base_url}/tokenize',
-                                json={
-                                    "content": context.tokenized_text,
-                                }) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens = tokenize_json['tokens']
-
-
-@step(u'tokens can be detokenize')
-@async_run_until_complete
-async def step_detokenize(context):
-    assert len(context.tokens) > 0
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{context.base_url}/detokenize',
-                                json={
-                                    "tokens": context.tokens,
-                                }) as response:
-            assert response.status == 200
-            detokenize_json = await response.json()
-            # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15
-            assert context.tokenized_text == detokenize_json['content'].strip()
-
-
-@step(u'an OPTIONS request is sent from {origin}')
-@async_run_until_complete
-async def step_options_request(context, origin):
-    async with aiohttp.ClientSession() as session:
-        async with session.options(f'{context.base_url}/v1/chat/completions',
-                                   headers={"Origin": origin}) as response:
-            assert response.status == 200
-            context.options_response = response
-
-
-@step(u'CORS header {cors_header} is set to {cors_header_value}')
-def step_check_options_header_value(context, cors_header, cors_header_value):
-    assert context.options_response.headers[cors_header] == cors_header_value
-
-
-@step(u'prometheus metrics are exposed')
-@async_run_until_complete
-async def step_prometheus_metrics_exported(context):
-    async with aiohttp.ClientSession() as session:
-        async with await session.get(f'{context.base_url}/metrics') as metrics_response:
-            assert metrics_response.status == 200
-            assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
-            metrics_raw = await metrics_response.text()
-            metric_exported = False
-            if context.debug:
-                print(f"/metrics answer:\n{metrics_raw}\n")
-            for metric in parser.text_string_to_metric_families(metrics_raw):
-                match metric.name:
-                    case "llamacpp:kv_cache_usage_ratio":
-                        assert len(metric.samples) > 0
-                        metric_exported = True
-            assert metric_exported, "No metrics exported"
-
-
-@step(u'available models')
-def step_available_models(context):
-    # openai client always expects an api_key
-    openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
-    openai.api_base = f'{context.base_url}/v1'
-    context.models = openai.Model.list().data
-
-
-@step(u'{n_model:d} models are supported')
-def step_supported_models(context, n_model):
-    if context.debug:
-        print("server models available:", context.models)
-    assert len(context.models) == n_model
-
-
-@step(u'model {i_model:d} is {param} {preposition} {param_value}')
-def step_supported_models(context, i_model, param, preposition, param_value):
-    assert i_model < len(context.models)
-    model = context.models[i_model]
-
-    param_value = param_value.split(' ', 1)[0]
-    match param:
-        case 'identified':
-            value = model.id
-        case 'trained':
-            value = str(model.meta.n_ctx_train)
-        case _:
-            assert False, "param {param} not supported"
-    assert param_value == value, f"model param {param} {value} != {param_value}"
-
-
-async def concurrent_requests(context, f_completion, *args, **kwargs):
-    n_prompts = len(context.prompts)
-    if context.debug:
-        print(f"starting {n_prompts} concurrent completion requests...")
-    assert n_prompts > 0
-    for prompt_no in range(n_prompts):
-        shifted_args = [context.prompts.pop(), *args]
-        context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.1)
-
-
-async def request_completion(prompt,
-                             base_url,
-                             debug=False,
-                             prompt_prefix=None,
-                             prompt_suffix=None,
-                             n_predict=None,
-                             seed=None,
-                             expect_api_error=None,
-                             user_api_key=None):
-    if debug:
-        print(f"Sending completion request: {prompt}")
-    origin = "my.super.domain"
-    headers = {
-        'Origin': origin
-    }
-    if user_api_key is not None:
-        if debug:
-            print(f"Set user_api_key: {user_api_key}")
-        headers['Authorization'] = f'Bearer {user_api_key}'
-
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/completion',
-                                json={
-                                    "input_prefix": prompt_prefix,
-                                    "prompt": prompt,
-                                    "input_suffix": prompt_suffix,
-                                    "n_predict": n_predict if n_predict is not None else -1,
-                                    "seed": seed if seed is not None else 42
-                                },
-                                headers=headers,
-                                timeout=3600) as response:
-            if expect_api_error is None or not expect_api_error:
-                assert response.status == 200
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                return await response.json()
-            else:
-                return response.status
-
-
-async def oai_chat_completions(user_prompt,
-                               system_prompt,
-                               base_url,
-                               base_path,
-                               async_client,
-                               debug=False,
-                               model=None,
-                               n_predict=None,
-                               enable_streaming=None,
-                               seed=None,
-                               user_api_key=None,
-                               expect_api_error=None):
-    if debug:
-        print(f"Sending OAI Chat completions request: {user_prompt}")
-    # openai client always expects an api key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    seed = seed if seed is not None else 42
-    enable_streaming = enable_streaming if enable_streaming is not None else False
-    payload = {
-        "messages": [
-            {
-                "role": "system",
-                "content": system_prompt,
-            },
-            {
-                "role": "user",
-                "content": user_prompt,
-            }
-        ],
-        "model": model,
-        "max_tokens": n_predict,
-        "stream": enable_streaming,
-        "seed": seed
-    }
-    completion_response = {
-        'content': '',
-        'timings': {
-            'predicted_n': 0
-        }
-    }
-    if async_client:
-        origin = 'llama.cpp'
-        headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
-            async with session.post(f'{base_url}{base_path}',
-                                    json=payload,
-                                    headers=headers) as response:
-                if enable_streaming:
-                    assert response.status == 200
-                    assert response.headers['Access-Control-Allow-Origin'] == origin
-                    assert response.headers['Content-Type'] == "text/event-stream"
-                    event_received = True
-                    while event_received:
-                        event_received = False
-                        async for line_in_bytes in response.content:
-                            line = line_in_bytes.decode('utf8')
-                            line = line.rstrip('\n').rstrip('\r')
-                            if line == '':
-                                continue
-                            event_data = line.split(': ', 1)
-                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
-                            chunk_raw = event_data[1]
-
-                            chunk = json.loads(chunk_raw)
-                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
-                            delta = chunk['choices'][0]['delta']
-                            if 'content' in delta:
-                                completion_response['content'] += delta['content']
-                                completion_response['timings']['predicted_n'] += 1
-                else:
-                    if expect_api_error is None or not expect_api_error:
-                        assert response.status == 200
-                        assert response.headers['Access-Control-Allow-Origin'] == origin
-                        assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                        chat_completion_raw = await response.json()
-                        completion_response = {
-                            'content': chat_completion_raw['choices'][0]['message'],
-                            'timings': {
-                                'predicted_n': chat_completion_raw['usage']['completion_tokens']
-                            }
-                        }
-                    else:
-                        return response.status
-    else:
-        try:
-            openai.api_key = user_api_key
-            openai.api_base = f'{base_url}{base_path}'
-            chat_completion = openai.Completion.create(
-                messages=payload['messages'],
-                model=model,
-                max_tokens=n_predict,
-                stream=enable_streaming,
-                seed=seed
-            )
-        except openai.error.APIError as e:
-            if expect_api_error is not None and expect_api_error:
-                return 401
-            else:
-                assert False, f'error raised: {e}'
-
-        if enable_streaming:
-            for chunk in chat_completion:
-                assert len(chunk.choices) == 1
-                delta = chunk.choices[0].delta
-                if 'content' in delta:
-                    completion_response['content'] += delta['content']
-                    completion_response['timings']['predicted_n'] += 1
-        else:
-            assert len(chat_completion.choices) == 1
-            completion_response = {
-                'content': chat_completion.choices[0].message.content,
-                'timings': {
-                    'predicted_n': chat_completion.usage.completion_tokens
-                }
-            }
-    if debug:
-        print("OAI response formatted to llama.cpp:", completion_response)
-    return completion_response
-
-
-async def request_embedding(content, base_url=None):
-    async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/embedding',
-                                json={
-                                    "content": content,
-                                }) as response:
-            assert response.status == 200
-            response_json = await response.json()
-            return response_json['embedding']
-
-
-async def request_oai_embeddings(input,
-                                 base_url=None, user_api_key=None,
-                                 model=None, async_client=False):
-    # openai client always expects an api_key
-    user_api_key = user_api_key if user_api_key is not None else 'nope'
-    if async_client:
-        origin = 'llama.cpp'
-        if user_api_key is not None:
-            headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
-        async with aiohttp.ClientSession() as session:
-            async with session.post(f'{base_url}/v1/embeddings',
-                                    json={
-                                        "input": input,
-                                        "model": model,
-                                    },
-                                    headers=headers) as response:
-                assert response.status == 200, f"received status code not expected: {response.status}"
-                assert response.headers['Access-Control-Allow-Origin'] == origin
-                assert response.headers['Content-Type'] == "application/json; charset=utf-8"
-                response_json = await response.json()
-                assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
-                assert response_json['object'] == 'list'
-                return response_json['data']
-    else:
-        openai.api_key = user_api_key
-        openai.api_base = f'{base_url}/v1'
-        oai_embeddings = openai.Embedding.create(
-            model=model,
-            input=input,
-        )
-
-        if isinstance(input, collections.abc.Sequence):
-            embeddings = []
-            for an_oai_embeddings in oai_embeddings.data:
-                embeddings.append(an_oai_embeddings.embedding)
-        else:
-            embeddings = oai_embeddings.data.embedding
-        return embeddings
-
-
-def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
-    content = completion_response['content']
-    n_predicted = completion_response['timings']['predicted_n']
-    assert len(content) > 0, "no token predicted"
-    if re_content is not None:
-        p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
-        matches = p.finditer(content)
-        last_match = 0
-        highlighted = ''
-        for match in matches:
-            start, end = match.span()
-            highlighted += content[last_match: start]
-            highlighted += '\x1b[33m'
-            highlighted += content[start: end]
-            highlighted += '\x1b[0m'
-            last_match = end
-        highlighted += content[last_match:]
-        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-          print(f"Checking completion response: {highlighted}\n")
-        assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
-    if expected_predicted_n and expected_predicted_n > 0:
-        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
-                                                     f' {n_predicted} <> {expected_predicted_n}')
-
-
-
-async def gather_tasks_results(context):
-    n_tasks = len(context.concurrent_tasks)
-    if context.debug:
-        print(f"Waiting for all {n_tasks} tasks results...\n")
-    for task_no in range(n_tasks):
-        context.tasks_result.append(await context.concurrent_tasks.pop())
-    n_completions = len(context.tasks_result)
-    return n_completions
-
-
-async def wait_for_health_status(context,
-                                 base_url,
-                                 expected_http_status_code,
-                                 expected_health_status,
-                                 timeout=3,
-                                 params=None,
-                                 slots_idle=None,
-                                 slots_processing=None,
-                                 expected_slots=None):
-    if context.debug:
-        print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
-    interval = 0.5
-    counter = 0
-    async with aiohttp.ClientSession() as session:
-        while True:
-            async with await session.get(f'{base_url}/health', params=params) as health_response:
-                status_code = health_response.status
-                health = await health_response.json()
-                if context.debug:
-                    print(f"HEALTH - response for expected health status='{expected_health_status}' on "
-                          f"'{base_url}/health'?{params} is {health}\n")
-                if (status_code == expected_http_status_code
-                        and health['status'] == expected_health_status
-                        and (slots_idle is None or health['slots_idle'] == slots_idle)
-                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
-                    if expected_slots is not None:
-                        assert_slots_status(health['slots'], expected_slots)
-                    return
-                if (status_code == expected_http_status_code
-                        and health['status'] == expected_health_status
-                        and (slots_idle is None or health['slots_idle'] == slots_idle)
-                        and (slots_processing is None or health['slots_processing'] == slots_processing)):
-                    if expected_slots is not None:
-                        assert_slots_status(health['slots'], expected_slots)
-                    return
-            await asyncio.sleep(interval)
-
-            counter += interval
-            if counter >= timeout:
-                # Sometimes health requests are triggered after completions are predicted
-                if expected_http_status_code == 503:
-                    if len(context.tasks_result) == 0:
-                        print("\x1b[5;37;43mWARNING: forcing concurrent tasks,"
-                              " busy health check missed, probably too fast inference\x1b[0m\n")
-                        n_completions = await gather_tasks_results(context)
-                        if n_completions > 0:
-                            return
-
-                assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
-
-
-def assert_embeddings(embeddings):
-    assert len(embeddings) > 0
-    embeddings_computed = False
-    for emb in embeddings:
-        if emb != 0:
-            embeddings_computed = True
-    assert embeddings_computed, f"Embeddings: {embeddings}"
-
-
-async def request_slots_status(context, expected_slots):
-    async with aiohttp.ClientSession() as session:
-        async with await session.get(f'{context.base_url}/slots') as slots_response:
-            assert slots_response.status == 200
-            slots = await slots_response.json()
-            assert_slots_status(slots, expected_slots)
-
-
-def assert_slots_status(slots, expected_slots):
-    assert len(slots) == len(expected_slots)
-    for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
-        for key in expected:
-            assert expected[key] == slot[key], (f"invalid slot {slot_id}"
-                                                f" expected[{key}] != slot[{key}]"
-                                                f" = {expected[key]} != {slot[key]}")
-
-
-async def completions_seed(context):
-    return context.seed if hasattr(context, 'seed') and context.seed is not None \
-        else context.server_seed if hasattr(context, 'server_seed') else None
-
-
-def start_server_background(context):
-    context.server_path = '../../../build/bin/server'
-    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
-        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
-    server_args = [
-        '--host', context.server_fqdn,
-        '--port', context.server_port,
-        '--model', context.model_file
-    ]
-    if context.n_batch:
-        server_args.extend(['--batch-size', context.n_batch])
-    if context.n_gpu_layer:
-        server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
-    if context.server_continuous_batching:
-        server_args.append('--cont-batching')
-    if context.server_embeddings:
-        server_args.append('--embedding')
-    if context.server_metrics:
-        server_args.append('--metrics')
-    if context.model_alias:
-        server_args.extend(['--alias', context.model_alias])
-    if context.n_ctx:
-        server_args.extend(['--ctx-size', context.n_ctx])
-    if context.n_slots:
-        server_args.extend(['--parallel', context.n_slots])
-    if context.n_server_predict:
-        server_args.extend(['--n-predict', context.n_server_predict])
-    if context.server_api_key:
-        server_args.extend(['--api-key', context.server_api_key])
-    if context.n_ga:
-        server_args.extend(['--grp-attn-n', context.n_ga])
-    if context.n_ga_w:
-        server_args.extend(['--grp-attn-w', context.n_ga_w])
-    if context.debug:
-        server_args.append('--verbose')
-    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
-        server_args.extend(['--log-format', "text"])
-    print(f"starting server with: {context.server_path} {server_args}\n")
-    context.server_process = subprocess.Popen(
-        [str(arg) for arg in [context.server_path, *server_args]],
-        close_fds=True)
-    print(f"server pid={context.server_process.pid}")
--- a/examples/server/tests/features/wrong_usages.feature
+++ b/examples/server/tests/features/wrong_usages.feature
@@ -1,22 +0,0 @@
-# run with: ./tests.sh --no-skipped --tags wrong_usage
-@wrong_usage
-Feature: Wrong usage of llama.cpp server
-
-  #3969 The user must always set --n-predict option
-  # to cap the number of tokens any completion request can generate
-  # or pass n_predict/max_tokens in the request.
-  Scenario: Infinite loop
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    # Uncomment below to fix the issue
-    #And   64 server max tokens to predict
-    Then  the server is starting
-    Given a prompt:
-      """
-      Go to: infinite loop
-      """
-    # Uncomment below to fix the issue
-    #And   128 max tokens to predict
-    Given concurrent completion requests
-    Then the server is idle
-    Then all prompts are predicted
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,5 +0,0 @@
-aiohttp~=3.9.3
-behave~=1.2.6
-huggingface_hub~=0.20.3
-openai~=0.25.0
-prometheus-client~=0.20.0
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-if [ $# -lt 1 ]
-then
-  # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
-else
-  behave "$@"
-fi
-
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -14,7 +14,6 @@
 using json = nlohmann::json;

 extern bool server_verbose;
-extern bool server_log_json;

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -28,14 +27,18 @@ extern bool server_log_json;
    {                                                                    \
        if (server_verbose)                                              \
        {                                                                \
-            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
        }                                                                \
    } while (0)
 #endif

-#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//

 enum server_state {
    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
@@ -46,8 +49,7 @@ enum server_state {
 enum task_type {
    TASK_TYPE_COMPLETION,
    TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE,
-    TASK_TYPE_METRICS
+    TASK_TYPE_NEXT_RESPONSE
 };

 struct task_server {
@@ -74,8 +76,51 @@ struct task_multi {
    std::vector<task_result> results{};
 };

+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
 // completion token output with probabilities
-struct completion_token_output {
+struct completion_token_output
+{
    struct token_prob
    {
        llama_token tok;
@@ -87,52 +132,26 @@ struct completion_token_output {
    std::string text_to_send;
 };

-struct token_translator {
-    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
-    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
-
-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
-    std::stringstream ss_tid;
-    ss_tid << std::this_thread::get_id();
-    json log = nlohmann::ordered_json{
-        {"tid", ss_tid.str()},
+static inline void server_log(const char *level, const char *function, int line,
+                       const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
        {"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
    };

-    if (server_log_json) {
-        log.merge_patch(
-                {
-                        {"level",     level},
-                        {"function",  function},
-                        {"line",      line},
-                        {"msg",       message},
-                });
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-
-        std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
-    } else {
-        char buf[1024];
-        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
-        if (!extra.empty()) {
-            log.merge_patch(extra);
-        }
-        std::stringstream ss;
-        ss << buf << " |";
-        for (const auto& el : log.items())
-        {
-            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
-            ss << " " << el.key() << "=" << value;
-        }
-
-        const std::string str = ss.str();
-        printf("%.*s\n", (int)str.size(), str.data());
-        fflush(stdout);
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
 }

 //
@@ -140,53 +159,58 @@ static inline void server_log(const char *level, const char *function, int line,
 //

 template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
    // Fallback null to default value
    return body.contains(key) && !body.at(key).is_null()
        ? body.value(key, default_value)
        : default_value;
 }

-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string & tmpl) {
-    llama_chat_message chat[] = {{"user", "test"}};
-    std::vector<char> buf(1);
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
-    return res >= 0;
+// Render chat messages as a Llama 2 prompt: "[INST] " opens a turn, a "system" message is wrapped
+// in <<SYS>> ... <</SYS>>, "user" ends with " [/INST]", any other role is closed with " </s>".
+inline std::string format_llama2(std::vector<json> messages)
+{
+    std::ostringstream output;
+    bool is_inside_turn = false;
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        if (!is_inside_turn) {
+            output << "[INST] ";
+        }
+        std::string role    = json_value(*it, "role", std::string("user"));
+        std::string content = json_value(*it, "content", std::string(""));
+        if (role == "system") {
+            // the system block is terminated by "<</SYS>>"; a second "<<SYS>>" leaves it unclosed
+            output << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
+            is_inside_turn = true;
+        } else if (role == "user") {
+            output << content << " [/INST]";
+            is_inside_turn = true;
+        } else {
+            output << " " << content << " </s>";
+            is_inside_turn = false;
+        }
+    }
+    LOG_VERBOSE("format_llama2", {{"text", output.str()}});
+    return output.str();
 }

-// Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
-    size_t alloc_size = 0;
-    // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;

-    for (size_t i = 0; i < messages.size(); ++i) {
-        auto &curr_msg = messages[i];
-        str[i*2 + 0]    = json_value(curr_msg, "role",    std::string(""));
-        str[i*2 + 1]    = json_value(curr_msg, "content", std::string(""));
-        alloc_size     += str[i*2 + 1].length();
-        chat[i].role    = str[i*2 + 0].c_str();
-        chat[i].content = str[i*2 + 1].c_str();
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role",    std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
    }

-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
+    chatml_msgs << "<|im_start|>assistant" << '\n';

-    // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+    LOG_VERBOSE("format_chatml", {{"text", chatml_msgs.str()}});

-    // if it turns out that our buffer is too small, we resize it
-    if ((size_t) res > buf.size()) {
-        buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
-    }
-
-    std::string formatted_chat(buf.data(), res);
-    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
-    return formatted_chat;
+    return chatml_msgs.str();
 }

 //
@@ -205,14 +229,13 @@ struct llama_server_queue {
    // callback functions
    std::function<void(task_server&)> callback_new_task;
    std::function<void(task_multi&)> callback_finish_multitask;
-    std::function<void(void)> callback_run_slots;
+    std::function<void(void)> callback_all_task_finished;

    // Add a new task to the end of the queue
    int post(task_server task) {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        if (task.id == -1) {
            task.id = id++;
-            LOG_VERBOSE("new task id", {{"new_id", task.id}});
        }
        queue_tasks.push_back(std::move(task));
        condition_tasks.notify_one();
@@ -228,9 +251,7 @@ struct llama_server_queue {
    // Get the next id for creating anew task
    int get_new_id() {
        std::unique_lock<std::mutex> lock(mutex_tasks);
-        int new_id = id++;
-        LOG_VERBOSE("new task id", {{"new_id", new_id}});
-        return new_id;
+        return id++;
    }

    // Register function to process a new task
@@ -238,14 +259,14 @@ struct llama_server_queue {
        callback_new_task = callback;
    }

-    // Register function to process a multitask when it is finished
+    // Register function to process a multitask
    void on_finish_multitask(std::function<void(task_multi&)> callback) {
        callback_finish_multitask = callback;
    }

-    // Register the function to be called when all slots data is ready to be processed
-    void on_run_slots(std::function<void(void)> callback) {
-        callback_run_slots = callback;
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
    }

    // Call when the state of one slot is changed
@@ -267,17 +288,12 @@ struct llama_server_queue {
        condition_tasks.notify_all();
    }

-    /**
-     * Main loop consists of these steps:
-     * - Wait until a new task arrives
-     * - Process the task (i.e. maybe copy data into slot)
-     * - Check if multitask is finished
-     * - Run all slots
-     */
+    // Start the main loop.
    void start_loop() {
        running = true;
        while (true) {
-            LOG_VERBOSE("new task may arrive", {});
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
            {
                while (true)
                {
@@ -289,11 +305,11 @@ struct llama_server_queue {
                    task_server task = queue_tasks.front();
                    queue_tasks.erase(queue_tasks.begin());
                    lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
+                    LOG_VERBOSE("callback_new_task", {});
                    callback_new_task(task);
                }
-                LOG_VERBOSE("update_multitasks", {});
-                // check if we have any finished multitasks
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks
                auto queue_iterator = queue_multitasks.begin();
                while (queue_iterator != queue_multitasks.end())
                {
@@ -310,9 +326,8 @@ struct llama_server_queue {
                        ++queue_iterator;
                    }
                }
-                // all tasks in the current loop is processed, slots data is now ready
-                LOG_VERBOSE("callback_run_slots", {});
-                callback_run_slots();
+                // all tasks in the current loop are finished
+                callback_all_task_finished();
            }
            LOG_VERBOSE("wait for new task", {});
            // wait for new task
@@ -370,16 +385,12 @@ struct llama_server_response {
    std::mutex mutex_results;
    std::condition_variable condition_results;

-    // add the task_id to the list of tasks waiting for response
    void add_waiting_task_id(int task_id) {
-        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.insert(task_id);
    }

-    // when the request is finished, we can remove task associated with it
    void remove_waiting_task_id(int task_id) {
-        LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
        std::unique_lock<std::mutex> lock(mutex_results);
        waiting_task_ids.erase(task_id);
    }
@@ -392,6 +403,7 @@ struct llama_server_response {
            condition_results.wait(lock, [&]{
                return !queue_results.empty();
            });
+            LOG_VERBOSE("condition_results unblock", {});

            for (int i = 0; i < (int) queue_results.size(); i++)
            {
@@ -416,22 +428,22 @@ struct llama_server_response {
    // Send a new result to a waiting task_id
    void send(task_result result) {
        std::unique_lock<std::mutex> lock(mutex_results);
-        LOG_VERBOSE("send new result", {{"task_id", result.id}});
+        LOG_VERBOSE("send new result", {});
        for (auto& task_id : waiting_task_ids) {
            // LOG_TEE("waiting task id %i \n", task_id);
            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
            if (result.multitask_id == task_id)
            {
-                LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
+                LOG_VERBOSE("callback_update_multitask", {});
                callback_update_multitask(task_id, result.id, result);
                continue;
            }

            if (result.id == task_id)
            {
-                LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
+                LOG_VERBOSE("queue_results.push_back", {});
                queue_results.push_back(result);
-                condition_results.notify_all();
+                condition_results.notify_one();
                return;
            }
        }
@@ -538,96 +550,3 @@ static std::string gen_chatcmplid()
    chatcmplid << "chatcmpl-" << random_string();
    return chatcmplid.str();
 }
-
-//
-// other common utils
-//
-
-static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
-{
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
-    {
-    }
-    return i;
-}
-
-static bool ends_with(const std::string &str, const std::string &suffix)
-{
-    return str.size() >= suffix.size() &&
-           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
-}
-
-static size_t find_partial_stop_string(const std::string &stop,
-                                       const std::string &text)
-{
-    if (!text.empty() && !stop.empty())
-    {
-        const char text_last_char = text.back();
-        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
-        {
-            if (stop[char_index] == text_last_char)
-            {
-                const std::string current_partial = stop.substr(0, char_index + 1);
-                if (ends_with(text, current_partial))
-                {
-                    return text.size() - char_index - 1;
-                }
-            }
-        }
-    }
-    return std::string::npos;
-}
-
-// TODO: reuse llama_detokenize
-template <class Iter>
-static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
-{
-    std::string ret;
-    for (; begin != end; ++begin)
-    {
-        ret += llama_token_to_piece(ctx, *begin);
-    }
-    return ret;
-}
-
-// format incomplete utf-8 multibyte character for output
-static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
-{
-    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
-    // if the size is 1 and first bit is 1, meaning it's a partial character
-    //   (size > 1 meaning it's already a known token)
-    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
-    {
-        std::stringstream ss;
-        ss << std::hex << (out[0] & 0xff);
-        std::string res(ss.str());
-        out = "byte: \\x" + res;
-    }
-    return out;
-}
-
-// convert a vector of completion_token_output to json
-static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
-{
-    json out = json::array();
-    for (const auto &prob : probs)
-    {
-        json probs_for_token = json::array();
-        for (const auto &p : prob.probs)
-        {
-            std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
-            probs_for_token.push_back(json
-            {
-                {"tok_str", tok_str},
-                {"prob",    p.prob},
-            });
-        }
-        std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
-        out.push_back(json{
-            {"content", tok_str},
-            {"probs",   probs_for_token},
-        });
-    }
-    return out;
-}
--- a/examples/simple/README.md
+++ b/examples/simple/README.md
@@ -7,7 +7,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for g

 ...

-main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
+main: n_len = 32, kv_size = 2048, n_parallel = 1, n_kv_req = 32

 Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -52,7 +52,7 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
+    ctx_params.kv_size = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

@@ -68,15 +68,15 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int n_ctx    = llama_n_ctx(ctx);
+    const int kv_size  = llama_kv_size(ctx);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d\n", __func__, n_len, kv_size, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
+    if (n_kv_req > kv_size) {
+        LOG_TEE("%s: error: n_kv_req > kv_size, the required KV cache size is not big enough\n", __func__);
+        LOG_TEE("%s:        either reduce n_len or increase kv_size\n", __func__);
        return 1;
    }

--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -6,4 +6,3 @@ More info:

 - https://github.com/ggerganov/llama.cpp/pull/2926
 - https://github.com/ggerganov/llama.cpp/pull/3624
- https://github.com/ggerganov/llama.cpp/pull/5625
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -5,7 +5,6 @@
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <set>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -19,7 +18,6 @@ struct seq_draft {
    std::vector<int> i_batch_tgt;

    std::vector<llama_token> tokens;
-    std::vector<std::vector<llama_token_data>> dists;

    struct llama_sampling_context * ctx_sampling;
 };
@@ -39,15 +37,12 @@ int main(int argc, char ** argv) {
    // max number of parallel drafting sequences (i.e. tree branches)
    const int n_seq_dft = params.n_parallel;

+    // probability threshold for accepting a token from the draft model
+    const float p_accept = params.p_accept;
+
    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
    const float p_split  = params.p_split;

-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-    std::default_random_engine rng(params.seed);
-    std::uniform_real_distribution<> u_dist;
-
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
    LOG_TEE("Log start\n");
@@ -121,7 +116,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);

-    const int max_context_size     = llama_n_ctx(ctx_tgt);
+    const int max_context_size     = llama_kv_size(ctx_tgt);
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
@@ -171,16 +166,14 @@ int main(int argc, char ** argv) {
    std::vector<seq_draft> drafts(n_seq_dft);

    params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-    if (params.sparams.temp == 0) {
-        params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
-    }
+    params.sparams.temp = -1.0f;    // force greedy sampling with probs for the draft model

    for (int s = 0; s < n_seq_dft; ++s) {
        drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
    }

-    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
+    llama_batch batch_dft = llama_batch_init(params.kv_size, 0, 1);
+    llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, n_seq_dft);

    const auto t_dec_start = ggml_time_us();

@@ -189,15 +182,12 @@ int main(int argc, char ** argv) {
    drafts[0].i_batch_tgt[0] = 0;

    while (true) {
-        std::set<int> active_seqs = {};
-
        // print current draft sequences
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
            }

-            active_seqs.insert(s);
            const auto & tokens = drafts[s].tokens;

            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
@@ -206,156 +196,48 @@ int main(int argc, char ** argv) {
        int i_dft  = 0;
        int s_keep = 0;

-        llama_token token_id;
-        std::string token_str;
-
-        // loop until we fail to accept a drafted token or we run out of drafted tokens
        while (true) {
+            LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+
+            // sample from the target model
+            llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
+
+            llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
+
+            //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
+
+            const std::string token_str = llama_token_to_piece(ctx_tgt, id);
+
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+
+            if (id == llama_token_eos(model_tgt)) {
+                has_eos = true;
+            }
+
+            ++n_predict;

            // check if the target token matches any of the drafts
-            // for stochastic sampling, attempt to match the token with the drafted tokens
            {
-                bool accept = false;
-                if (params.sparams.temp > 0) {
-                    // stochastic verification
+                bool matches = false;

-                    llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
-                    float p_tgt = 0, p_dft = 0;
-
-                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());
-
-                    while (active_seqs.size() > 0) {
-                        // randomly select a sequence to verify from active sequences
-                        std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
-                        int s = *std::next(active_seqs.begin(), u_int_dist(rng));
-                        if (i_dft >= (int) drafts[s].tokens.size()) {
-                            drafts[s].active = false;
-                            active_seqs.erase(s);
-                            continue;
-                        }
-                        if (accept) {
-                            // if we already accepted a token, we can skip the rest
-                            if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) {
-                                drafts[s].active = false;
-                                active_seqs.erase(s);
-                            }
-                            continue;
-                        }
-                        LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
-                        float r = u_dist(rng);
-                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
-                        // acquire the token probabilities assigned by the draft and target models
-                        for (size_t i = 0; i < dist_tgt.size; i++) {
-                            if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
-                                p_tgt = dist_tgt.data[i].p;
-                            }
-                            if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
-                                p_dft = dist_dft.data[i].p;
-                            }
-                            if (p_tgt && p_dft) {
-                                break;
-                            }
-                        }
-                        LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
-                        if (r <= p_tgt / p_dft) {
-                            s_keep = s;
-                            accept = true;
-                            token_id = drafts[s].tokens[i_dft];
-                            token_str = llama_token_to_piece(ctx_tgt, token_id);
-                            llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
-
-                            LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
-                            break;
-                        } else {
-                            LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
-                            drafts[s].active = false;
-
-                            // calculate residual probability
-                            GGML_ASSERT(dist_tgt.sorted);
-                            GGML_ASSERT(dist_dft.sorted);
-                            float sum_probs = 0.0f;
-
-                            // sort dist by id
-                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
-                                return a.id < b.id;
-                            });
-                            std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) {
-                                return a.id < b.id;
-                            });
-
-                            for (size_t i = 0; i < dist_tgt.size; i++) {
-                                dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
-                                sum_probs += dist_tgt.data[i].p;
-                            }
-                            for (size_t i = 0; i < dist_tgt.size; i++) {
-                                dist_tgt.data[i].p /= sum_probs;
-                            }
-
-                            // sort dist_tgt by p desc
-                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
-                                return a.p > b.p;
-                            });
-                        }
-
-                        active_seqs.erase(s);
-                        for(int i = 0; i < n_seq_dft; i++) {
-                            if (i == s) {
-                                continue;
-                            }
-                            if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
-                                // synchronize active status for sequences with the same drafted token
-                                drafts[i].active = drafts[i].active && accept;
-                                if (!drafts[i].active) {
-                                    active_seqs.erase(s);
-                                }
-                            }
-                        }
+                for (int s = 0; s < n_seq_dft; ++s) {
+                    if (!drafts[s].active) {
+                        continue;
                    }

-                    if (!accept) {
-                        // all drafted tokens were rejected
-                        // sample from the target model
-                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
-                        token_id = llama_sample_token(ctx_tgt, &dist_tgt);
-                        llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
-                        token_str = llama_token_to_piece(ctx_tgt, token_id);
-                    }
+                    if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
+                        LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());

-                } else {
-                    // greedy verification
-
-                    // sample from the target model
-                    LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-                    token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
-
-                    llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
-
-                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
-
-                    token_str = llama_token_to_piece(ctx_tgt, token_id);
-
-                    for (int s = 0; s < n_seq_dft; ++s) {
-                        if (!drafts[s].active) {
-                            continue;
-                        }
-
-                        if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
-                            LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
-
-                            s_keep = s;
-                            accept = true;
-                        } else {
-                            drafts[s].active = false;
-                        }
+                        s_keep = s;
+                        matches = true;
+                    } else {
+                        drafts[s].active = false;
                    }
                }

-                if (token_id == llama_token_eos(model_tgt)) {
-                    has_eos = true;
-                }
-                ++n_predict;
-
-                if (accept) {
+                if (matches) {
                    ++n_accept;
                    ++n_past_tgt;
                    ++n_past_dft;
@@ -363,21 +245,17 @@ int main(int argc, char ** argv) {
                    if (params.use_color) {
                        // Color token according to its origin sequence
                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
-                    } else {
-                        printf("%s", token_str.c_str());
+                        fflush(stdout);
                    }
-                    fflush(stdout);
                    continue;
-                } else {
-                    printf("%s", token_str.c_str());
-                    fflush(stdout);
-                    break;
                }
            }
-        }
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+            fflush(stdout);

-        {
-            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

            // TODO: simplify
            {
@@ -397,21 +275,21 @@ int main(int argc, char ** argv) {
                drafts[s].active = false;
                drafts[s].tokens.clear();
                drafts[s].i_batch_tgt.clear();
-                drafts[s].dists.clear();
            }
            // note: will be erased after the speculation phase
-            drafts[0].tokens.push_back(token_id);
-            drafts[0].dists.push_back(std::vector<llama_token_data>());
+            drafts[0].tokens.push_back(id);
            drafts[0].i_batch_tgt.push_back(0);

            llama_batch_clear(batch_dft);
-            llama_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
+            llama_batch_add  (batch_dft, id, n_past_dft, { 0 }, true);

            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
-            llama_decode(ctx_dft, batch_dft);
+            llama_decode         (ctx_dft, batch_dft);

            ++n_past_dft;
+
+            break;
        }

        if (n_predict > params.n_predict || has_eos) {
@@ -456,6 +334,12 @@ int main(int argc, char ** argv) {
                            k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
                }

+                if (cur_p[0].p < p_accept) {
+                    LOG("stopping drafting for seq %3d, probability too low: %.3f < %.3f\n", s, cur_p[0].p, p_accept);
+                    drafts[s].drafting = false;
+                    continue;
+                }
+
                std::vector<int> sa(1, s);

                // attempt to split the branch if the probability is high enough
@@ -483,7 +367,6 @@ int main(int argc, char ** argv) {
                        drafts[n_seq_cur].skip     = true;

                        drafts[n_seq_cur].tokens      = drafts[s].tokens;
-                        drafts[n_seq_cur].dists       = drafts[s].dists;
                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

@@ -506,8 +389,6 @@ int main(int argc, char ** argv) {
                    llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);

                    drafts[s].tokens.push_back(id);
-                    // save cur_p.data into drafts[s].dists
-                    drafts[s].dists.push_back(cur_p);

                    // add unique drafted tokens to the target batch
                    drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@@ -559,7 +440,6 @@ int main(int argc, char ** argv) {
            }

            drafts[s].tokens.erase(drafts[s].tokens.begin());
-            drafts[s].dists.erase(drafts[s].dists.begin());
        }
    }

--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -7,7 +7,7 @@

 #include "ggml-sycl.h"

-int main() {
+int main(int argc, char ** argv) {
    ggml_backend_sycl_print_sycl_devices();
    return 0;
 }
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -8,19 +8,12 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

 if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
+    export GGML_SYCL_DEVICE=$1
 else
-    GGML_SYCL_DEVICE=0
+    export GGML_SYCL_DEVICE=0
 fi
-echo "use $GGML_SYCL_DEVICE as main GPU"
+echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
 #export GGML_SYCL_DEBUG=1
-
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-#use all GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-
-#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0

--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -22,7 +22,7 @@

 struct my_llama_hparams {
    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;
+    uint32_t kv_size = 512;
    uint32_t n_embd  = 4096;
    uint32_t n_head  = 32;
    uint32_t n_layer = 32;
@@ -112,7 +112,7 @@ static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";

 static void print_params(struct my_llama_hparams * params) {
    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: kv_size: %u\n", __func__, params->kv_size);
    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
    printf("%s: n_head:  %u\n", __func__, params->n_head);
    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
@@ -272,7 +272,7 @@ static struct ggml_tensor * llama_build_train_graphs(
    const int n_past = 0;
    const int N = n_tokens;
    const auto & hparams = model->hparams;
-    const int n_ctx      = hparams.n_ctx;
+    const int kv_size    = hparams.kv_size;
    const int n_vocab    = hparams.n_vocab;
    const int n_embd     = hparams.n_embd;
    const int n_layer    = hparams.n_layer;
@@ -295,13 +295,13 @@ static struct ggml_tensor * llama_build_train_graphs(
    ggml_set_input(KQ_pos);

    // rope has so many parameters that we make a custom function for it
-    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
+    auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
                (struct ggml_tensor * t) -> struct ggml_tensor * {
        // not capturing these, to silence warnings
        const int rope_mode = 0;

        return ggml_rope_custom(
-            ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+                ctx, t, KQ_pos, n_rot, rope_mode, kv_size, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };

@@ -487,8 +487,8 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
    GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);

-    // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
-    GGUF_GET_KEY(fctx, model->hparams.n_ctx,   gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
+    // kv_size was not saved in earlier checkpoint file versions, so we make it optional here
+    GGUF_GET_KEY(fctx, model->hparams.kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));

    GGUF_GET_KEY(fctx, model->hparams.n_embd,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
    GGUF_GET_KEY(fctx, model->hparams.n_ff,    gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
@@ -543,7 +543,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);

    // set hparams
-    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.n_ctx                  );
+    gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH),              model->hparams.kv_size                  );
    gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH),            model->hparams.n_embd                 );
    gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH),         model->hparams.n_ff                   );
    gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT),        model->hparams.n_head                 );
@@ -945,7 +945,7 @@ int main(int argc, char ** argv) {

    struct my_llama_model model;
    model.hparams.n_vocab = llama_n_vocab(lmodel);
-    model.hparams.n_ctx   = params.common.n_ctx;
+    model.hparams.kv_size = params.common.n_ctx;
    model.hparams.n_embd  = params.n_embd;
    model.hparams.n_head  = params.n_head;
    model.hparams.n_layer = params.n_layer;
@@ -960,7 +960,7 @@ int main(int argc, char ** argv) {
    struct ggml_opt_context * opt   = train->opt;

    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
+    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt->params.print_forward_graph     = false;
    opt->params.print_backward_graph    = false;
    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
@@ -982,9 +982,9 @@ int main(int argc, char ** argv) {
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
    if (existed) {
-        // overwrite last n_ctx with user provided n_ctx
+        // overwrite last kv_size with user provided kv_size
        if (params.common.custom_n_ctx) {
-            model.hparams.n_ctx = params.common.n_ctx;
+            model.hparams.kv_size = params.common.n_ctx;
        }

        const bool opt_past_changed = opt->params.past != params.common.opt_past;
@@ -1031,7 +1031,7 @@ int main(int argc, char ** argv) {
    printf("%s: opt_size  = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
    printf("%s: opt iter %d\n", __func__, opt->iter);

-    int n_tokens = model.hparams.n_ctx;
+    int n_tokens = model.hparams.kv_size;
    int n_vocab  = model.hparams.n_vocab;
    int n_batch  = params.common.n_batch;

--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "nixpkgs-lib": "nixpkgs-lib"
      },
      "locked": {
-        "lastModified": 1709336216,
-        "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
+        "lastModified": 1706830856,
+        "narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=",
        "owner": "hercules-ci",
        "repo": "flake-parts",
-        "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
+        "rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1709237383,
-        "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
+        "lastModified": 1708118438,
+        "narHash": "sha256-kk9/0nuVgA220FcqH/D2xaN6uGyHp/zoxPNUmPCMmEE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
+        "rev": "5863c27340ba4de8f83e7e3c023b9599c3cb3c80",
        "type": "github"
      },
      "original": {
@@ -37,11 +37,11 @@
    "nixpkgs-lib": {
      "locked": {
        "dir": "lib",
-        "lastModified": 1709237383,
-        "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
+        "lastModified": 1706550542,
+        "narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
+        "rev": "97b17f32362e475016f942bbdfda4a4a72a8a652",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -107,12 +107,11 @@
        # ```
        #
        # Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format
-        flake.overlays.default = (
-          final: prev: {
+        flake.overlays.default =
+          (final: prev: {
            llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            inherit (final.llamaPackages) llama-cpp;
-          }
-        );
+          });

        systems = [
          "aarch64-darwin"
@@ -132,9 +131,6 @@
            ...
          }:
          {
-            # For standardised reproducible formatting with `nix fmt`
-            formatter = pkgs.nixfmt-rfc-style;
-
            # Unlike `.#packages`, legacyPackages may contain values of
            # arbitrary types (including nested attrsets) and may even throw
            # exceptions. This attribute isn't recursed into by `nix flake
@@ -154,7 +150,6 @@
            packages =
              {
                default = config.legacyPackages.llamaPackages.llama-cpp;
-                vulkan = config.packages.default.override { useVulkan = true; };
              }
              // lib.optionalAttrs pkgs.stdenv.isLinux {
                opencl = config.packages.default.override { useOpenCL = true; };
@@ -162,6 +157,7 @@

                mpi-cpu = config.packages.default.override { useMpi = true; };
                mpi-cuda = config.packages.default.override { useMpi = true; };
+                vulkan = config.packages.default.override { useVulkan = true; };
              }
              // lib.optionalAttrs (system == "x86_64-linux") {
                rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -377,9 +377,6 @@ struct ggml_gallocr {

    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;
-
-    struct tensor_alloc * leaf_allocs; // [n_leafs]
-    int n_leafs;
 };

 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -430,7 +427,6 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
-    free(galloc->leaf_allocs);
    free(galloc);
 }

@@ -468,7 +464,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
-                    continue;
+                    break;
                }

                // if the node's data is external, then we cannot re-use it
@@ -548,8 +544,22 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));

+    // allocate all graph inputs first to avoid overwriting them
+    for (int i = 0; i < graph->n_nodes; i++) {
+        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (graph->nodes[i]->src[j] == NULL) {
+                continue;
+            }
+            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
+                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
+            }
+        }
+    }
+
    // count number of children and views
-    // allocate all graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

@@ -558,37 +568,14 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }

-        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-
        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-
-            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
-
-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
-                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            struct ggml_tensor * parent = node->src[j];
+            if (parent == NULL) {
+                break;
            }
+            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
        }
-    }
-
-    // allocate the remaining leafs that are unused on the graph
-    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-
-        if (hn->n_children == 0) {
-            assert(!hn->allocated);
-            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
-            ggml_gallocr_allocate_node(galloc, leaf, 0);
-        }
-    }
+   }

    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
@@ -599,7 +586,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                continue;
+                break;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }
@@ -611,7 +598,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                continue;
+                break;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@@ -624,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                continue;
+                break;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;
@@ -709,18 +696,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
        }
    }
-    if (galloc->n_leafs < graph->n_leafs) {
-        free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
-        GGML_ASSERT(galloc->leaf_allocs != NULL);
-    }
-    galloc->n_leafs = graph->n_leafs;
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].offset = hn->offset;
-        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
-    }

    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
@@ -747,8 +722,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);

    if (node->view_src != NULL) {
        if (node->buffer == NULL) {
@@ -757,20 +732,29 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                // this tensor was allocated without ggml-backend
                return;
            }
-            ggml_backend_view_init(galloc->buffers[buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
        }
    } else {
        if (node->data == NULL) {
            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
        } else {
            if (node->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
+
+#ifndef NDEBUG
+            size_t offset =
+                (char *)node->data -
+                (char *)ggml_backend_buffer_get_base(node->buffer);
+            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
+            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
+            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
+#endif
        }
    }
 }
@@ -789,13 +773,6 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
        return true;
    }

-    if (galloc->n_leafs != graph->n_leafs) {
-#ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
-#endif
-        return true;
-    }
-
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -850,7 +827,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
    }

    // allocate the graph tensors from the previous assignments
-    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -859,15 +835,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
            if (src == NULL) {
                continue;
            }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
        }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
-    }
-    // leafs
-    for (int i = 0; i < graph->n_leafs; i++) {
-        struct ggml_tensor * leaf = graph->leafs[i];
-        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
    }

    return true;
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -91,22 +91,19 @@ extern "C" {
        // (optional) complete all pending operations
        void (*GGML_CALL synchronize)(ggml_backend_t backend);

-        // create a plan for ggml_cgraph and free it
+        // compute graph with a plan
        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
    };

    struct ggml_backend {
-        ggml_guid_t guid;
-
        struct ggml_backend_i iface;

        ggml_backend_context_t context;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Pierrick HYMBERT	47c662b0de	fix some spaces added by IDE in math op ggml-ci	2024-02-18 22:40:35 +02:00
Pierrick HYMBERT	606873401c	rename n_ctx to kv_size	2024-02-18 22:40:35 +02:00
Pierrick HYMBERT	ef96e8b1f7	server: document the --ctx-size deprecation in server README.md	2024-02-18 22:40:34 +02:00
Pierrick HYMBERT	9a0695671d	server: rename legacy --ctx-size to --kv-size	2024-02-18 22:40:32 +02:00