kompute : llama-bench support and ggml_cpu_has_kompute() (#5226 )

Revert "server : change deps.sh xxd files to string literals (#5221 )"
This reverts commit 4003be0e5f.
2026-02-12 14:03:20 +02:00 · 2024-01-30 19:04:37 -05:00 · 2024-01-30 21:19:26 +02:00 · 2024-01-30 20:17:30 +02:00 · 2024-01-30 20:15:05 +02:00 · 2024-01-30 19:15:28 +02:00
146 changed files with 96908 additions and 2443 deletions
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG UBUNTU_VERSION=22.04
+
+FROM intel/hpckit:$ONEAPI_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
+    cmake --build . --config Release --target main server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/build/bin/main /main
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -7,6 +7,18 @@
    { system, ... }:
    {
      _module.args = {
+        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
+        # again, the below creates several nixpkgs instances which the
+        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
+        #
+        # This is currently "slow" and "expensive", on a certain scale.
+        # This also isn't "right" in that this hinders dependency injection at
+        # the level of flake inputs. This might get removed in the foreseeable
+        # future.
+        #
+        # Note that you can use these expressions without Nix
+        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
+
        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -73,6 +73,7 @@ let
    ps: [
      ps.numpy
      ps.sentencepiece
+      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers
    ]
@@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;

+    # Note: none of the files discarded here are visible in the sandbox or
+    # affect the output hash. This also means they can be modified without
+    # triggering a rebuild.
    src = lib.cleanSourceWith {
      filter =
        name: type:
-        !(builtins.any (_: _) [
+        let
+          noneOf = builtins.all (x: !x);
+          baseName = baseNameOf name;
+        in
+        noneOf [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
-        ]);
+          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
+          (lib.hasPrefix "." baseName) # Skip hidden files and directories
+          (baseName == "flake.lock")
+        ];
      src = lib.cleanSource ../../.;
    };

@@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (

    cmakeFlags =
      [
-        (cmakeBool "LLAMA_NATIVE" true)
+        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" true)
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
@@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
        description = "contains numpy and sentencepiece";
        buildInputs = [ llama-python ];
        inputsFrom = [ finalAttrs.finalPackage ];
+        shellHook = ''
+          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
+        '';
      };

      shell-extra = mkShell {
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -4,6 +4,10 @@
  llamaVersion ? "0.0.0",
 }:

+# We're using `makeScope` instead of just writing out an attrset
+# because it allows users to apply overlays later using `overrideScope'`.
+# Cf. https://noogle.dev/f/lib/makeScope
+
 lib.makeScope newScope (
  self: {
    inherit llamaVersion;
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -0,0 +1,32 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=11.7.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1
+
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+
+COPY --from=build /app/server /server
+
+ENTRYPOINT [ "/server" ]
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -0,0 +1,25 @@
+ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
+ARG UBUNTU_VERSION=22.04
+
+FROM intel/hpckit:$ONEAPI_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
+    cmake --build . --config Release --target main server
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/build/bin/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/server" ]
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -0,0 +1,20 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/server /server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
--- a/.ecrc
+++ b/.ecrc
@@ -1,4 +1,5 @@
 {
+  "Exclude": ["^\\.gitmodules$"],
  "Disable": {
    "IndentSize": true
  }
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -72,7 +72,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -107,7 +107,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900

  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest
@@ -141,7 +141,48 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --verbose
+          ctest -L main --verbose
+
+  ubuntu-22-cmake-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake --build . --config Release -j $(nproc)

  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
@@ -202,7 +243,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900

  macOS-latest-cmake-ios:
    runs-on: macos-latest
@@ -295,7 +336,8 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.21.1-2023-04-24
+      SDE_VERSION: 9.33.0-2024-01-07
+      VULKAN_VERSION: 1.3.261.1

    strategy:
      matrix:
@@ -312,6 +354,8 @@ jobs:
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'kompute'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -320,6 +364,12 @@ jobs:
        with:
          fetch-depth: 0

+      - name: Clone Kompute submodule
+        id: clone_kompute
+        if: ${{ matrix.build == 'kompute' }}
+        run: |
+          git submodule update --init kompute
+
      - name: Download OpenCL SDK
        id: get_opencl
        if: ${{ matrix.build == 'clblast' }}
@@ -354,6 +404,15 @@ jobs:
          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll

+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'kompute' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
      - name: Build
        id: cmake_build
        run: |
@@ -391,22 +450,23 @@ jobs:

      - name: Test
        id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
+        # not all machines have native AVX-512
+        if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
-          ctest -C Release --verbose --timeout 900
+          ctest -L main -C Release --verbose --timeout 900

      - name: Test (Intel SDE)
        id: cmake_test_sde
        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
-          & $sde -future -- ctest -C Release --verbose --timeout 900
+          & $sde -future -- ctest -L main -C Release --verbose --timeout 900

      - name: Determine tag name
        id: tag
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -28,13 +28,18 @@ jobs:
        config:
          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
          #                     have disabled them for now until the reason why
          #                     is understood.
          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -2,13 +2,20 @@ name: Nix aarch64 builds

 on:
  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
+    # 1.5h instead of minutes with the cold cache).
+    #
+    # randint(0, 59), randint(0, 23)
+    - cron: '26 12 * * *'
+  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
+    paths: ['**/*.nix', 'flake.lock']

 jobs:
  nix-build-aarch64:
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,10 +5,8 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']

 jobs:
  nix-eval:
--- a/.gitignore
+++ b/.gitignore
@@ -27,7 +27,7 @@
 lcov-report/
 gcovr-report/

-build*/
+build*
 out/
 tmp/

@@ -89,20 +89,3 @@ examples/jeopardy/results.txt

 poetry.lock
 poetry.toml
-
-# Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0-llama
-/tests/test-tokenizer-0-falcon
-/tests/test-tokenizer-1-llama
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
-/tests/test-autorelease
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "kompute"]
+	path = kompute
+	url = https://github.com/nomic-ai/kompute.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
+include(CheckIncludeFileCXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@@ -47,6 +48,7 @@ option(BUILD_SHARED_LIBS                "build shared libraries"
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
+option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)

 # debug
 option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
@@ -97,24 +99,39 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
+option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
+option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
+option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
+option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)

 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)

+
+# add perf arguments
+option(LLAMA_PERF                            "llama: enable perf"                               OFF)
+if (LLAMA_PERF)
+    add_definitions(-DGGML_PERF)
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

 #
 # Compile flags
 #
+if (LLAMA_SYCL)
+    set(CMAKE_CXX_STANDARD 17)
+else()
+    set(CMAKE_CXX_STANDARD 11)
+endif()

-set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -401,6 +418,28 @@ if (LLAMA_CLBLAST)
    endif()
 endif()

+if (LLAMA_VULKAN)
+    find_package(Vulkan)
+    if (Vulkan_FOUND)
+        message(STATUS "Vulkan found")
+
+        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
+        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
+
+        add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
+        if (BUILD_SHARED_LIBS)
+            set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        endif()
+        target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+
+        add_compile_definitions(GGML_USE_VULKAN)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
+    else()
+        message(WARNING "Vulkan not found")
+    endif()
+endif()
+
 if (LLAMA_HIPBLAS)
    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

@@ -446,6 +485,185 @@ if (LLAMA_HIPBLAS)
    endif()
 endif()

+if (LLAMA_SYCL)
+    if ( NOT DEFINED ENV{ONEAPI_ROOT})
+        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+    endif()
+    #todo: AOT
+
+    find_package(IntelSYCL REQUIRED)
+    if (LLAMA_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
+    add_compile_definitions(GGML_USE_SYCL)
+
+    add_compile_options(-I./) #include DPCT
+    add_compile_options(-I/${SYCL_INCLUDE_DIR})
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
+    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+endif()
+
+if (LLAMA_KOMPUTE)
+    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
+    find_package(Vulkan COMPONENTS glslc REQUIRED)
+    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
+    if (NOT glslc_executable)
+        message(FATAL_ERROR "glslc not found")
+    endif()
+
+    function(compile_shader)
+      set(options)
+      set(oneValueArgs)
+      set(multiValueArgs SOURCES)
+      cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+      foreach(source ${compile_shader_SOURCES})
+        get_filename_component(filename ${source} NAME)
+        set(spv_file ${filename}.spv)
+        add_custom_command(
+            OUTPUT ${spv_file}
+            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
+              ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
+              COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
+            COMMENT "Compiling ${source} to ${spv_file}"
+        )
+
+        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
+        set(FILE_NAME "shader${RAW_FILE_NAME}")
+        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
+        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
+        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+        if(CMAKE_GENERATOR MATCHES "Visual Studio")
+            add_custom_command(
+              OUTPUT ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              DEPENDS ${spv_file} xxd
+              COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
+            )
+        else()
+            add_custom_command(
+              OUTPUT ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+              COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+              DEPENDS ${spv_file} xxd
+              COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
+            )
+        endif()
+      endforeach()
+    endfunction()
+
+    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+        message(STATUS "Kompute found")
+        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
+        add_subdirectory(kompute)
+
+        # Compile our shaders
+        compile_shader(SOURCES
+          kompute-shaders/op_scale.comp
+          kompute-shaders/op_scale_8.comp
+          kompute-shaders/op_add.comp
+          kompute-shaders/op_addrow.comp
+          kompute-shaders/op_mul.comp
+          kompute-shaders/op_silu.comp
+          kompute-shaders/op_relu.comp
+          kompute-shaders/op_gelu.comp
+          kompute-shaders/op_softmax.comp
+          kompute-shaders/op_norm.comp
+          kompute-shaders/op_rmsnorm.comp
+          kompute-shaders/op_diagmask.comp
+          kompute-shaders/op_mul_mat_mat_f32.comp
+          kompute-shaders/op_mul_mat_f16.comp
+          kompute-shaders/op_mul_mat_q8_0.comp
+          kompute-shaders/op_mul_mat_q4_0.comp
+          kompute-shaders/op_mul_mat_q4_1.comp
+          kompute-shaders/op_mul_mat_q6_k.comp
+          kompute-shaders/op_getrows_f16.comp
+          kompute-shaders/op_getrows_q4_0.comp
+          kompute-shaders/op_getrows_q4_1.comp
+          kompute-shaders/op_getrows_q6_k.comp
+          kompute-shaders/op_rope_f16.comp
+          kompute-shaders/op_rope_f32.comp
+          kompute-shaders/op_cpy_f16_f16.comp
+          kompute-shaders/op_cpy_f16_f32.comp
+          kompute-shaders/op_cpy_f32_f16.comp
+          kompute-shaders/op_cpy_f32_f32.comp
+        )
+
+        # Create a custom target for our generated shaders
+        add_custom_target(generated_shaders DEPENDS
+          shaderop_scale.h
+          shaderop_scale_8.h
+          shaderop_add.h
+          shaderop_addrow.h
+          shaderop_mul.h
+          shaderop_silu.h
+          shaderop_relu.h
+          shaderop_gelu.h
+          shaderop_softmax.h
+          shaderop_norm.h
+          shaderop_rmsnorm.h
+          shaderop_diagmask.h
+          shaderop_mul_mat_mat_f32.h
+          shaderop_mul_mat_f16.h
+          shaderop_mul_mat_q8_0.h
+          shaderop_mul_mat_q4_0.h
+          shaderop_mul_mat_q4_1.h
+          shaderop_mul_mat_q6_k.h
+          shaderop_getrows_f16.h
+          shaderop_getrows_q4_0.h
+          shaderop_getrows_q4_1.h
+          shaderop_getrows_q6_k.h
+          shaderop_rope_f16.h
+          shaderop_rope_f32.h
+          shaderop_cpy_f16_f16.h
+          shaderop_cpy_f16_f32.h
+          shaderop_cpy_f32_f16.h
+          shaderop_cpy_f32_f32.h
+        )
+
+        # Create a custom command that depends on the generated_shaders
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
+            DEPENDS generated_shaders
+            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
+        )
+
+        # Add the stamp to the main sources to ensure dependency tracking
+        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+        set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+        add_compile_definitions(GGML_USE_KOMPUTE)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
+    else()
+        message(WARNING "Kompute not found")
+    endif()
+endif()
+
 function(get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")
@@ -458,17 +676,24 @@ function(get_flags CCID CCVER)
            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
        )
-            set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
+            list(APPEND C_FLAGS -Wdouble-promotion)
        endif()
    elseif (CCID STREQUAL "GNU")
        set(C_FLAGS   -Wdouble-promotion)
        set(CXX_FLAGS -Wno-array-bounds)

        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
+            list(APPEND CXX_FLAGS -Wno-format-truncation)
        endif()
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    elseif (CCID MATCHES "Intel")
+        if (NOT LLAMA_SYCL)
+            # enable max optimization level when using Intel compiler
+            set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            add_link_options(-fuse-ld=lld -static-intel)
        endif()
    endif()

@@ -497,16 +722,18 @@ if (LLAMA_ALL_WARNINGS)
    endif()
 endif()

+set(CUDA_CXX_FLAGS "")
+
 if (LLAMA_CUBLAS)
    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
    if (NOT MSVC)
-        set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
+        list(APPEND CUDA_FLAGS -Wno-pedantic)
    endif()

    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
        endif()

        execute_process(
@@ -534,13 +761,8 @@ if (LLAMA_CUBLAS)
        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")

        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS)  # pass host compiler flags as a single argument
-        if (NOT CUDA_CXX_FLAGS STREQUAL "")
-            set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
-        endif()
+        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
    endif()
-
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
 endif()

 if (WIN32)
@@ -561,6 +783,17 @@ if (LLAMA_LTO)
    endif()
 endif()

+if (LLAMA_CCACHE)
+    find_program(LLAMA_CCACHE_FOUND ccache)
+    if (LLAMA_CCACHE_FOUND)
+        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
+        set(ENV{CCACHE_SLOPPINESS} time_macros)
+        message(STATUS "Using ccache")
+    else()
+        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
+    endif ()
+endif()
+
 # this version of Apple ld64 is buggy
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
@@ -594,12 +827,7 @@ if (NOT MSVC)
    endif()
 endif()

-function(add_compile_option_cpp ARG)
-    # Adds a compile option to C/C++ only, but not for Cuda.
-    # Use, e.g., for CPU-architecture flags.
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
-    add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
-endfunction()
+set(ARCH_FLAGS "")

 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
    message(STATUS "ARM detected")
@@ -612,19 +840,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
    else()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            add_compile_options(-mfp16-format=ieee)
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
-            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
            # Raspberry Pi 2
-            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            add_compile_options(-mno-unaligned-access)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
@@ -635,7 +863,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
            include(cmake/FindSIMD.cmake)
        endif ()
        if (LLAMA_AVX512)
-            add_compile_option_cpp(/arch:AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
@@ -649,49 +877,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
        elseif (LLAMA_AVX2)
-            add_compile_option_cpp(/arch:AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (LLAMA_AVX)
-            add_compile_option_cpp(/arch:AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        if (LLAMA_NATIVE)
-            add_compile_option_cpp(-march=native)
+            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (LLAMA_F16C)
-            add_compile_option_cpp(-mf16c)
+            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (LLAMA_FMA)
-            add_compile_option_cpp(-mfma)
+            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (LLAMA_AVX)
-            add_compile_option_cpp(-mavx)
+            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (LLAMA_AVX2)
-            add_compile_option_cpp(-mavx2)
+            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (LLAMA_AVX512)
-            add_compile_option_cpp(-mavx512f)
-            add_compile_option_cpp(-mavx512bw)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (LLAMA_AVX512_VBMI)
-            add_compile_option_cpp(-mavx512vbmi)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
-            add_compile_option_cpp(-mavx512vnni)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        add_compile_options(-mcpu=powerpc64le)
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
-        add_compile_options(-mcpu=native -mtune=native)
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
 else()
    message(STATUS "Unknown architecture")
 endif()

+add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (LLAMA_CUBLAS)
+    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
+    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
+    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+    endif()
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+endif()
+
 if (MINGW)
    # Target Windows 8 for PrefetchVirtualMemory
    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
@@ -766,11 +1006,14 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@@ -846,8 +1089,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)

-set(GGML_PUBLIC_HEADERS "ggml.h"
-        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
+        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
--- a/22
+++ b/22
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -448,6 +448,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST

+ifdef LLAMA_VULKAN
+	MK_CPPFLAGS  += -DGGML_USE_VULKAN
+	MK_LDFLAGS += -lvulkan
+	OBJS    += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_VULKAN
+
 ifdef LLAMA_HIPBLAS

 	ifeq ($(wildcard /opt/rocm),)
@@ -619,7 +632,7 @@ embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -748,5 +761,8 @@ tests/test-c.o: tests/test-c.c llama.h
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/README.md
+++ b/README.md
@@ -14,7 +14,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 - Collecting Apple Silicon performance stats:
  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216

 ----
@@ -63,7 +62,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
 - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
- CUDA, Metal and OpenCL GPU backend support
+- CUDA, Metal, OpenCL, SYCL GPU backend support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@@ -112,6 +111,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
+- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)


 **Bindings:**
@@ -121,13 +121,15 @@ as the main playground for developing new features for the [ggml](https://github
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
+- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
+- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
+- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)

 **UI:**

@@ -288,7 +290,7 @@ In order to build llama.cpp you have three different options.
        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
            opencl clblast openblas

-            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
        ```

    **Notes:** With this packages you can build llama.cpp with OPENBLAS and
@@ -596,15 +598,24 @@ Building the program with BLAS support may lead to some performance improvements

  You can get a list of platforms and devices from the `clinfo -l` command, etc.

+- #### SYCL
+
+  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+  llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+  For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
+
+
 ### Prepare Data & Run

 ```bash
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
-  # [Optional] for models using BPE tokenizers
-  ls ./models
-  65B 30B 13B 7B vocab.json
+# [Optional] for models using BPE tokenizers
+ls ./models
+65B 30B 13B 7B vocab.json

 # install Python dependencies
 python3 -m pip install -r requirements.txt
@@ -612,8 +623,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/

-  # [Optional] for models using BPE tokenizers
-  python convert.py models/7B/ --vocabtype bpe
+# [Optional] for models using BPE tokenizers
+python convert.py models/7B/ --vocabtype bpe

 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
@@ -929,17 +940,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 * Create a folder to store big models & intermediate files (ex. /llama/models)

 #### Images
-We have two Docker images available for this project:
+We have three Docker images available for this project:

 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)

 Additionally, there the following images, similar to the above:

 - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).

@@ -965,6 +979,12 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

+or with a server image:
+
+```bash
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+```
+
 ### Docker With CUDA

 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -974,6 +994,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
 ```bash
 docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
 docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
 ```

 You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@@ -987,6 +1008,7 @@ The resulting images, are essentially the same as the non-CUDA images:

 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.

 #### Usage

@@ -995,6 +1017,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```

 ### Contributing
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -0,0 +1,252 @@
+# llama.cpp for SYCL
+
+[Background](#background)
+
+[OS](#os)
+
+[Intel GPU](#intel-gpu)
+
+[Linux](#linux)
+
+[Environment Variable](#environment-variable)
+
+[Known Issue](#known-issue)
+
+[Todo](#todo)
+
+## Background
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+
+oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+
+Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
+
+To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+
+The llama.cpp for SYCL is used to support Intel GPUs.
+
+For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
+
+## OS
+
+|OS|Status|Verified|
+|-|-|-|
+|Linux|Support|Ubuntu 22.04|
+|Windows|Ongoing| |
+
+
+## Intel GPU
+
+|Intel GPU| Status | Verified Model|
+|-|-|-|
+|Intel Data Center Max Series| Support| Max 1550|
+|Intel Data Center Flex Series| Support| Flex 170|
+|Intel Arc Series| Support| Arc 770|
+|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+
+
+## Linux
+
+### Setup Environment
+
+1. Install Intel GPU driver.
+
+a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+
+Note: for iGPU, please install the client GPU driver.
+
+b. Add user to group: video, render.
+
+```
+sudo usermod -aG render username
+sudo usermod -aG video username
+```
+
+Note: re-login to enable it.
+
+c. Check
+
+```
+sudo apt install clinfo
+sudo clinfo -l
+```
+
+Output (example):
+
+```
+Platform #0: Intel(R) OpenCL Graphics
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
+
+
+Platform #0: Intel(R) OpenCL HD Graphics
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
+```
+
+2. Install Intel® oneAPI Base toolkit.
+
+
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+
+Recommend to install to default folder: **/opt/intel/oneapi**.
+
+Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
+
+b. Check
+
+```
+source /opt/intel/oneapi/setvars.sh
+
+sycl-ls
+```
+
+There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
+
+Output (example):
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+
+```
+
+2. Build locally:
+
+```
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
+
+```
+
+or
+
+```
+./examples/sycl/build.sh
+```
+
+Note:
+
+- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+
+### Run
+
+1. Put model file to folder **models**
+
+2. Enable oneAPI running environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. List device ID
+
+Run without parameter:
+
+```
+./build/bin/ls-sycl-device
+
+or
+
+./build/bin/main
+```
+
+Check the ID in startup log, like:
+
+```
+found 4 SYCL devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
+    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
+    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero running time, recommended |
+|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+
+4. Set device ID and execute llama.cpp
+
+Set device ID = 0 by **GGML_SYCL_DEVICE=0**
+
+```
+GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+or run by script:
+
+```
+./examples/sycl/run_llama2.sh
+```
+
+Note:
+
+- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
+
+
+5. Check the device ID in output
+
+Like：
+```
+Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+```
+
+
+## Environment Variable
+
+#### Build
+
+|Name|Value|Function|
+|-|-|-|
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
+|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
+|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
+|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
+
+#### Running
+
+
+|Name|Value|Function|
+|-|-|-|
+|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
+|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+
+## Known Issue
+
+- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+
+  Miss to enable oneAPI running environment.
+
+  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
+
+
+- Hang during startup
+
+  llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
+
+  Solution: add **--no-mmap**.
+
+## Todo
+
+- Support to build in Windows.
+
+- Support multiple cards.
--- a/ci/README.md
+++ b/ci/README.md
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt

 # with CUDA support
 GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with SYCL support
+source /opt/intel/oneapi/setvars.sh
+GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -10,6 +10,9 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -22,9 +25,9 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")

-rm -v $OUT/*.log
-rm -v $OUT/*.exit
-rm -v $OUT/*.md
+rm -f "$OUT/*.log"
+rm -f "$OUT/*.exit"
+rm -f "$OUT/*.md"

 sd=`dirname $0`
 cd $sd/../
@@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi

+if [ ! -z ${GG_BUILD_SYCL} ]; then
+    if [ -z ${ONEAPI_ROOT} ]; then
+        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+        exit 1
+    fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -94,7 +105,7 @@ function gg_run_ctest_debug {
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@@ -123,9 +134,9 @@ function gg_run_ctest_release {
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
-        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
@@ -141,6 +152,61 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }

+function gg_get_model {
+    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
+    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_3b ]]; then
+        echo -n "$gguf_3b"
+    elif [[ -s $gguf_7b ]]; then
+        echo -n "$gguf_7b"
+    else
+        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
+        exit 1
+    fi
+}
+
+function gg_run_ctest_with_model_debug {
+    cd ${SRC}
+
+    local model; model=$(gg_get_model)
+    cd build-ci-debug
+    set -e
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    set +e
+    cd ..
+}
+
+function gg_run_ctest_with_model_release {
+    cd ${SRC}
+
+    local model; model=$(gg_get_model)
+    cd build-ci-release
+    set -e
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    set +e
+    cd ..
+}
+
+function gg_sum_ctest_with_model_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest with model files in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
+function gg_sum_ctest_with_model_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest with model files in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
 # open_llama_3b_v2

 function gg_run_open_llama_3b_v2 {
@@ -183,8 +249,6 @@ function gg_run_open_llama_3b_v2 {

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

-    ./bin/test-autorelease ${model_f16}
-
    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
@@ -507,14 +571,18 @@ function gg_sum_open_llama_7b_v2 {
 ## main

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
-
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

-    python3 -m pip install -r ${SRC}/requirements.txt
-    python3 -m pip install --editable gguf-py
+    # Create a fresh python3 venv and enter it
+    python3 -m venv "$MNT/venv"
+    source "$MNT/venv/bin/activate"
+
+    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
+    pip install --editable gguf-py --disable-pip-version-check
 fi

 ret=0
@@ -529,6 +597,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
        else
            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
+        test $ret -eq 0 && gg_run ctest_with_model_debug
+        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -42,6 +42,10 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
@@ -203,6 +207,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.prompt_cache_all = true;
        } else if (arg == "--prompt-cache-ro") {
            params.prompt_cache_ro = true;
+        } else if (arg == "-bf" || arg == "--binary-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i], std::ios::binary);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            // store the external file name in params
+            params.prompt_file = argv[i];
+            std::ostringstream ss;
+            ss << file.rdbuf();
+            params.prompt = ss.str();
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -582,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
        } else if (arg == "--split-mode" || arg == "-sm") {
            if (++i >= argc) {
                invalid_param = true;
@@ -601,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
+
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
@@ -626,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                    params.tensor_split[i] = 0.0f;
                }
            }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUBLAS_SYCL
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--numa") {
@@ -653,6 +675,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
+        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.logits_file = argv[i];
        } else if (arg == "--perplexity" || arg == "--all-logits") {
            params.logits_all = true;
        } else if (arg == "--ppl-stride") {
@@ -689,6 +717,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
+        } else if (arg == "--multiple-choice") {
+            params.multiple_choice = true;
+        } else if (arg == "--multiple-choice-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.multiple_choice_tasks = std::stoi(argv[i]);
+        } else if (arg == "--kl-divergence") {
+            params.kl_divergence = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
@@ -888,6 +926,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    printf("  -f FNAME, --file FNAME\n");
    printf("                        prompt file to start generation.\n");
+    printf("  -bf FNAME, --binary-file FNAME\n");
+    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -936,6 +976,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
+    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
+    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
+    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base");
    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
@@ -969,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
    printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
    printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif
+#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
    printf("  -gan N, --grp-attn-n N\n");
@@ -1476,9 +1519,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
+    fprintf(stream, "cpu_has_kompute: %s\n",     ggml_cpu_has_kompute()     ? "true" : "false");
    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
--- a/common/common.h
+++ b/common/common.h
@@ -91,6 +91,7 @@ struct gpt_params {
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files
+    std::string logits_file       = "";  // file for saving *all* logits

    std::vector<llama_model_kv_override> kv_overrides;

@@ -108,6 +109,11 @@ struct gpt_params {
    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
+    bool   kl_divergence   = false; // compute KL-divergence
+
    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
        // will be empty (default) if there are parse errors
        if (result->parsed_grammar.rules.empty()) {
            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+            delete result;
            return nullptr;
        }

@@ -129,6 +130,8 @@ static void sampler_queue(
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float         temp              = params.temp;
+    const float         dynatemp_range    = params.dynatemp_range;
+    const float         dynatemp_exponent = params.dynatemp_exponent;
    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
    const float         top_p             = params.top_p;
    const float         min_p             = params.min_p;
@@ -143,7 +146,15 @@ static void sampler_queue(
            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            case 't':
+                if (dynatemp_range > 0) {
+                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
+                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
+                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
+                } else {
+                    llama_sample_temp(ctx_main, &cur_p, temp);
+                }
+                break;
            default : break;
        }
    }
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -18,6 +18,8 @@ typedef struct llama_sampling_params {
    float       tfs_z                 = 1.00f;    // 1.0 = disabled
    float       typical_p             = 1.00f;    // 1.0 = disabled
    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
    float       penalty_freq          = 0.00f;    // 0.0 = disabled
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast

 import numpy as np
 import torch
@@ -189,6 +189,8 @@ class Model:
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
+        if model_architecture == "Qwen2ForCausalLM":
+            return Model
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "GPT2LMHeadModel":
@@ -197,6 +199,10 @@ class Model:
            return Phi2Model
        if model_architecture == "PlamoForCausalLM":
            return PlamoModel
+        if model_architecture == "CodeShellForCausalLM":
+            return CodeShellModel
+        if model_architecture == "OrionForCausalLM":
+            return OrionModel
        return Model

    def _is_model_safetensors(self) -> bool:
@@ -234,6 +240,8 @@ class Model:
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
+        if arch == "Qwen2ForCausalLM":
+            return gguf.MODEL_ARCH.QWEN2
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "GPT2LMHeadModel":
@@ -242,6 +250,10 @@ class Model:
            return gguf.MODEL_ARCH.PHI2
        if arch == "PlamoForCausalLM":
            return gguf.MODEL_ARCH.PLAMO
+        if arch == "CodeShellForCausalLM":
+            return gguf.MODEL_ARCH.CODESHELL
+        if arch == "OrionForCausalLM":
+            return gguf.MODEL_ARCH.ORION

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -281,6 +293,58 @@ class Model:
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

+    def _set_vocab_qwen(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

@@ -479,7 +543,8 @@ class MPTModel(Model):
            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name.replace("scales", "act.scales")
+                if new_name is not None:
+                    new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
@@ -511,6 +576,83 @@ class MPTModel(Model):
                self.gguf_writer.add_tensor("output.weight", data)


+class OrionModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+        hf_repo = self.hparams.get("_name_or_path", "")
+
+        ctx_length = 0
+        if "max_sequence_length" in self.hparams:
+            ctx_length = self.hparams["max_sequence_length"]
+        elif "max_position_embeddings" in self.hparams:
+            ctx_length = self.hparams["max_position_embeddings"]
+        elif "model_max_length" in self.hparams:
+            ctx_length = self.hparams["model_max_length"]
+        else:
+            print("gguf: can not find ctx length parameter.")
+            sys.exit()
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_source_hf_repo(hf_repo)
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
+
+    def write_tensors(self):
+        # Collect tensors from generator object
+        model_kv = dict(self.get_tensors())
+        block_count = self.hparams["num_hidden_layers"]
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in model_kv.items():
+            # we don't need these
+            if name.endswith(".rotary_emb.inv_freq"):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 class BaichuanModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()
@@ -868,6 +1010,13 @@ class PersimmonModel(Model):


 class StableLMModel(Model):
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
+            self._set_vocab_qwen()
+
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@@ -896,7 +1045,7 @@ class QwenModel(Model):
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
@@ -913,52 +1062,7 @@ class QwenModel(Model):
        return parts

    def set_vocab(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[self.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
-
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.special_tokens
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
-        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
+        self._set_vocab_qwen()

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
@@ -1176,6 +1280,70 @@ class PlamoModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


+class CodeShellModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("CodeShell")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_freq_base(10000.0)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+        self.gguf_writer.add_rope_scaling_factor(1.0)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        tensors = dict(self.get_tensors())
+        has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
+        for name, data_torch in tensors.items():
+            # we don't need these
+            if name.endswith((".attn.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "transformer.wte.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
 ###### CONVERSION LOGIC ######


@@ -1213,7 +1381,7 @@ def main() -> None:

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -2,6 +2,7 @@
 from __future__ import annotations

 import argparse
+import os
 import struct
 import sys
 from enum import IntEnum
@@ -9,7 +10,6 @@ from pathlib import Path

 import numpy as np

-import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(
-        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype)
-    # FIXME: Respect cfg.vocab_dir?
-    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
-                               load_merges = cfg.vocabtype == 'bpe',
-                               n_vocab = vocab.vocab_size)
+    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
+    vocab_factory = convert.VocabFactory(vocab_path)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
-    return (params, vocab, svocab)
+    return params, vocab, special_vocab


 def handle_args():
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -5,17 +5,16 @@ import json
 import os
 import struct
 import sys
+from pathlib import Path
 from typing import Any, BinaryIO, Sequence

 import numpy as np
 import torch

-from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

-
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


@@ -60,7 +59,14 @@ if __name__ == '__main__':
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

-    model = torch.load(input_model, map_location="cpu")
+    if os.path.exists(input_model):
+        model = torch.load(input_model, map_location="cpu")
+    else:
+        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+        model = load_file(input_model, device="cpu")
+
    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python3
-import torch
-import os
-from pprint import pprint
-import sys
 import argparse
+import os
+import sys
 from pathlib import Path
+from pprint import pprint
+
+import torch
 from sentencepiece import SentencePieceProcessor
+
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -69,7 +71,7 @@ def main():
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
-    tensors = {}
+    tensors: dict[str, torch.Tensor] = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
--- a/convert.py
+++ b/convert.py
@@ -17,58 +17,28 @@ import signal
 import struct
 import sys
 import time
-import warnings
 import zipfile
 from abc import ABCMeta, abstractmethod
-from argparse import ArgumentParser
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (
-    IO,
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Iterable,
-    Literal,
-    Optional,
-    Tuple,
-    TypeVar,
-)
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar

 import numpy as np
 from sentencepiece import SentencePieceProcessor

-try:
-    from transformers import AutoTokenizer
-except ModuleNotFoundError as e:
-    warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf

-# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
-if "NO_LOCAL_GGUF" not in os.environ:
-    # Use absolute path to the gguf-py directory
-    gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
-    print(gguf_py_dir)  # NOTE: Remove this once path is verified after changes are completed
-    if gguf_py_dir not in sys.path:
-        sys.path.insert(1, gguf_py_dir)
+if TYPE_CHECKING:
+    from typing import TypeAlias

-# Import gguf module
-try:
-    import gguf
-except ModuleNotFoundError as e:
-    print(f"Could not import gguf: {e}")
-    sys.exit(1)
-
-if TYPE_CHECKING:  # NOTE: This isn't necessary.
-    from typing import TypeAlias  # This can technically be omitted.
-
-if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
+if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)

-# NOTE: n-dimensional arrays should be directly referenced
-NDArray: TypeAlias = "np.ndarray[Any, Any]"
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'

-# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
@@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
 #


-# TODO: Clean up and refactor data types
@dataclass(frozen=True)
 class DataType:
    name: str
@@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {

@dataclass
 class Params:
-    n_vocab: int
-    n_embd: int
-    n_layer: int
-    n_ctx: int
-    n_ff: int
-    n_head: int
-    n_head_kv: int
-    f_norm_eps: Optional[float] = None
-    n_experts: Optional[int] = None
-    n_experts_used: Optional[int] = None
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None

-    rope_scaling_type: Optional[gguf.RopeScalingType] = None
-    f_rope_freq_base: Optional[float] = None
-    f_rope_scale: Optional[float] = None
-    n_orig_ctx: Optional[int] = None
-    rope_finetuned: Optional[bool] = None
+    rope_scaling_type: gguf.RopeScalingType | None = None
+    f_rope_freq_base: float | None = None
+    f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None

-    ftype: Optional[GGMLFileType] = None
+    ftype: GGMLFileType | None = None

    # path to the directory containing the model files
-    path_model: Optional[Path] = None
+    path_model: Path | None = None

    @staticmethod
-    def guessed(model: LazyModel) -> "Params":
+    def guessed(model: LazyModel) -> Params:
        # try transformer naming first
-        n_vocab, n_embd = (
-            model["model.embed_tokens.weight"].shape
-            if "model.embed_tokens.weight" in model
-            else model["tok_embeddings.weight"].shape
-        )
+        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape

        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
-            )
-        elif (
-            "model.layers.0.self_attn.W_pack.weight" in model
-        ):  # next: try baichuan naming
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
-            )
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
        else:
-            n_layer = next(
-                i
-                for i in itertools.count()
-                if f"layers.{i}.attention.wq.weight" not in model
-            )
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

        if n_layer < 1:
-            raise Exception(
-                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
-            )
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

-        n_head = n_embd // 128  # guessed
-        n_mult = 256  # guessed
+        n_head = n_embd // 128 # guessed
+        n_mult = 256           # guessed

        # TODO: verify this
        n_ff = int(2 * (4 * n_embd) / 3)
        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)

        return Params(
-            n_vocab=n_vocab,
-            n_embd=n_embd,
-            n_layer=n_layer,
-            n_ctx=-1,
-            n_ff=n_ff,
-            n_head=n_head,
-            n_head_kv=n_head,
-            f_norm_eps=1e-5,
+            n_vocab    = n_vocab,
+            n_embd     = n_embd,
+            n_layer    = n_layer,
+            n_ctx      = -1,
+            n_ff       = n_ff,
+            n_head     = n_head,
+            n_head_kv  = n_head,
+            f_norm_eps = 1e-5,
        )

    @staticmethod
-    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
+    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@@ -274,22 +223,20 @@ class Params:
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
-                rope_finetuned = rope_scaling["finetuned"]
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
            else:
-                raise NotImplementedError(f"Unknown rope scaling type: {typ}")
+                raise NotImplementedError(f'Unknown rope scaling type: {typ}')

        if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            raise Exception(
-                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
-            )
+            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

-        n_experts = None
+        n_experts      = None
        n_experts_used = None

        if "num_local_experts" in config:
@@ -297,30 +244,30 @@ class Params:
            n_experts_used = config["num_experts_per_tok"]

        return Params(
-            n_vocab=config["vocab_size"],
-            n_embd=config["hidden_size"],
-            n_layer=config["num_hidden_layers"],
-            n_ctx=n_ctx,
-            n_ff=config["intermediate_size"],
-            n_head=(n_head := config["num_attention_heads"]),
-            n_head_kv=config.get("num_key_value_heads", n_head),
-            n_experts=n_experts,
-            n_experts_used=n_experts_used,
-            f_norm_eps=config["rms_norm_eps"],
-            f_rope_freq_base=config.get("rope_theta"),
-            rope_scaling_type=rope_scaling_type,
-            f_rope_scale=f_rope_scale,
-            n_orig_ctx=n_orig_ctx,
-            rope_finetuned=rope_finetuned,
+            n_vocab           = config["vocab_size"],
+            n_embd            = config["hidden_size"],
+            n_layer           = config["num_hidden_layers"],
+            n_ctx             = n_ctx,
+            n_ff              = config["intermediate_size"],
+            n_head            = (n_head := config["num_attention_heads"]),
+            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
+            f_norm_eps        = config["rms_norm_eps"],
+            f_rope_freq_base  = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
+            f_rope_scale      = f_rope_scale,
+            n_orig_ctx        = n_orig_ctx,
+            rope_finetuned    = rope_finetuned,
        )

    # LLaMA v2 70B params.json
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
-    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
+    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

-        n_experts = None
+        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None

@@ -343,85 +290,85 @@ class Params:

        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-            n_experts = config["moe"]["num_experts"]
+            n_experts      = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

        return Params(
-            n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
-            n_embd=config["dim"],
-            n_layer=config["n_layers"],
-            n_ctx=n_ctx,
-            n_ff=n_ff,
-            n_head=(n_head := config["n_heads"]),
-            n_head_kv=config.get("n_kv_heads", n_head),
-            n_experts=n_experts,
-            n_experts_used=n_experts_used,
-            f_norm_eps=config["norm_eps"],
-            f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
+            n_embd           = config["dim"],
+            n_layer          = config["n_layers"],
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = (n_head := config["n_heads"]),
+            n_head_kv        = config.get("n_kv_heads", n_head),
+            n_experts        = n_experts,
+            n_experts_used   = n_experts_used,
+            f_norm_eps       = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )

    @staticmethod
-    def load(model_plus: ModelPlus) -> "Params":
-        hf_config_path = model_plus.paths[0].parent / "config.json"
+    def load(model_plus: ModelPlus) -> Params:
+        hf_config_path   = model_plus.paths[0].parent / "config.json"
        orig_config_path = model_plus.paths[0].parent / "params.json"

        if hf_config_path.exists():
-            params = Params.load_transformers_config(model_plus.model, hf_config_path)
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
-            params = Params.load_torch_params(model_plus.model, orig_config_path)
-        elif model_plus.format != "none":
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
+        elif model_plus.format != 'none':
            params = Params.guessed(model_plus.model)
        else:
-            raise ValueError("Cannot guess params when model format is none")
+            raise ValueError('Cannot guess params when model format is none')

        params.path_model = model_plus.paths[0].parent

        return params


-class BpeVocab:  # GPT
-    def __init__(
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
-    ) -> None:
-        self.bpe_tokenizer = json.loads(
-            open(str(fname_tokenizer), encoding="utf-8").read()
-        )
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+#
+# vocab
+#
+
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except KeyError:
+            self.vocab = self.bpe_tokenizer
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
-                    (item["content"], item["id"])
-                    for item in tokenizer_json.get("added_tokens", [])
+                    (item['content'], item['id'])
+                    for item in tokenizer_json.get('added_tokens', [])
                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item["content"] not in self.bpe_tokenizer
-                )
+                    if item['content'] not in self.bpe_tokenizer)

        vocab_size: int = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(
-                f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
-            )
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_dict    = added_tokens
+        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -442,10 +389,8 @@ class BpeVocab:  # GPT
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class SentencePieceVocab:  # LlaMa
-    def __init__(
-        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
-    ) -> None:
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@@ -455,23 +400,19 @@ class SentencePieceVocab:  # LlaMa

        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()

-        new_tokens = {
-            id: piece for piece, id in added_tokens.items() if id >= vocab_size
-        }
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
+        actual_new_ids   = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
-            raise ValueError(
-                f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
-            )
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -512,11 +453,15 @@ class SentencePieceVocab:  # LlaMa


 class HfVocab:
-    def __init__(
-        self,
-        fname_tokenizer: Path,
-        fname_added_tokens: Optional[Path] = None,
-    ) -> None:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use HfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
@@ -529,7 +474,7 @@ class HfVocab:
        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
+        self.added_tokens_ids  = set()

        # Process added tokens
        for tok, tokidx in sorted(
@@ -550,12 +495,12 @@ class HfVocab:

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)

-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

-    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }
@@ -573,11 +518,9 @@ class HfVocab:
                token_id, self.special_ids  # Reuse already stored special IDs
            )

-    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
        # Determine token type based on whether it's a special token
-        return (
-            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-        )
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
@@ -589,7 +532,6 @@ class HfVocab:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], self.special_ids)
                score = self.get_token_score(self.specials[text])
-
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0
@@ -783,7 +725,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    else:
        model = merge_sharded([mp.model for mp in models_plus])

-    return ModelPlus(model, paths, format, vocab)
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types


 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -871,17 +813,13 @@ class LazyUnpickler(pickle.Unpickler):
    CLASSES: dict[tuple[str, str], Any] = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
-        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
-            rebuild_from_type_v2, "__func__"
-        ),
-        ("torch._utils", "_rebuild_tensor_v2"): getattr(
-            lazy_rebuild_tensor_v2, "__func__"
-        ),
-        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
-        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
-        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
-        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
-        ("torch", "Tensor"): LazyTensor,
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
+        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
+        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
+        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
+        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
@@ -968,7 +906,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
        executor_class = ProcessPoolExecutor
    else:
        executor_class = ThreadPoolExecutor
-    with executor_class(max_workers = max_workers) as executor:
+    with executor_class(max_workers=max_workers) as executor:
        futures: list[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
@@ -1022,12 +960,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N


 class OutputFile:
-    def __init__(
-        self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
-    ) -> None:
-        self.gguf = gguf.GGUFWriter(
-            fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
-        )
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -1036,21 +970,16 @@ class OutputFile:
        if params.n_ctx == 4096:
            name = "LLaMA v2"
        elif params.path_model is not None:
-            name = str(params.path_model.parent).split("/")[-1]
+            name = str(params.path_model.parent).split('/')[-1]

-        self.gguf.add_name(name)
-        self.gguf.add_context_length(params.n_ctx)
-        self.gguf.add_embedding_length(params.n_embd)
-        self.gguf.add_block_count(params.n_layer)
-        self.gguf.add_feed_forward_length(params.n_ff)
+        self.gguf.add_name                (name)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
-        self.gguf.add_head_count(params.n_head)
-        self.gguf.add_head_count_kv(params.n_head_kv)
-
-        if params.f_norm_eps is None:
-            raise ValueError("f_norm_eps is None")
-
-        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        self.gguf.add_head_count          (params.n_head)
+        self.gguf.add_head_count_kv       (params.n_head_kv)

        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)
@@ -1058,6 +987,11 @@ class OutputFile:
        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)

+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')
+
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

@@ -1089,7 +1023,7 @@ class OutputFile:

        return tokenizer_model

-    def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
+    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
        tokens = []
        scores = []
        toktypes = []
@@ -1124,14 +1058,10 @@ class OutputFile:

    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
        n_elements = int(np.prod(tensor.shape))
-        raw_dtype = getattr(tensor.data_type, "ggml_type", None)
-        data_type = (
-            getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
-        )
+        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
+        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(
-            name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
-        )
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)

    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@@ -1145,14 +1075,10 @@ class OutputFile:

    @staticmethod
    def write_vocab_only(
-        fname_out: Path,
-        params: Params,
-        vocab: Vocab,
-        svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False,
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

@@ -1180,14 +1106,8 @@ class OutputFile:

    @staticmethod
    def write_all(
-        fname_out: Path,
-        ftype: GGMLFileType,
-        params: Params,
-        model: LazyModel,
-        vocab: Vocab,
-        svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@@ -1207,26 +1127,19 @@ class OutputFile:
        of.write_tensor_info()

        # tensor data
-        ndarrays_inner = bounded_parallel_map(
-            OutputFile.do_item, model.items(), concurrency=concurrency
-        )
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
        if ftype == GGMLFileType.MostlyQ8_0:
            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize,
-                ndarrays_inner,
-                concurrency=concurrency,
-                max_workers=concurrency,
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
                use_processpool_executor=True,
            )
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(
-            zip(model.items(), ndarrays)
-        ):
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
            elapsed = time.time() - start
-            size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
            print(
                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@@ -1363,7 +1276,7 @@ def load_some_model(path: Path) -> ModelPlus:
 class VocabFactory:
    def __init__(self, path: Path):
        self.path = path
-        self.files = {
+        self.files: dict[str, Path | None] = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
@@ -1380,24 +1293,18 @@ class VocabFactory:
                self.files[file] = parent_file_path
        print(f"Found vocab files: {self.files}")

-    def _select_file(self, vocabtype: Optional[str]) -> Path:
+    def _select_file(self, vocabtype: str | None) -> Path:
        if vocabtype in ["spm", "bpe"]:
            for file_key in self.files.keys():
-                if self.files[file_key]:
-                    return self.files[file_key]
+                if (file := self.files[file_key]) is not None:
+                    return file
            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        elif vocabtype == "hfft":
+        if vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
-        else:
-            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")

-    def _create_special_vocab(
-        self,
-        vocab: Vocab,
-        vocabtype: str,
-        model_parent_path: Path,
-    ) -> gguf.SpecialVocab:
+    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
@@ -1407,13 +1314,12 @@ class VocabFactory:
            n_vocab=n_vocab,
        )

-    def load_vocab(
-        self, vocabtype: str, model_parent_path: Path
-    ) -> Tuple[Vocab, gguf.SpecialVocab]:
+    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")

        added_tokens_path = path.parent / "added_tokens.json"
+        vocab: Vocab
        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
@@ -1428,6 +1334,7 @@ class VocabFactory:
            )
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        # FIXME: Respect --vocab-dir?
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
@@ -1436,18 +1343,17 @@ class VocabFactory:
        return vocab, special_vocab


-def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
-        GGMLFileType.AllF32: "f32",
+        GGMLFileType.AllF32:    "f32",
        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0: "q8_0",
+        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
        sys.stderr.write(
            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n"
-        )
+            "Please explicitly specify a path using --outfile.\n")
        sys.exit(1)
    return ret

@@ -1457,111 +1363,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
    print(f"model_plus.format = {model_plus.format!r}")
    print(f"model_plus.vocab = {model_plus.vocab!r}")
    for name, lazy_tensor in model_plus.model.items():
-        print(
-            f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
-        )
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")


-def get_argument_parser() -> ArgumentParser:
+def main(args_in: list[str] | None = None) -> None:
    output_choices = ["f32", "f16"]
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
+    vocab_types = ["spm", "bpe", "hfft"]
+    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None)
+    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
+    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--vocab-type",  choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
+    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
+    parser.add_argument("--big-endian",  action="store_true",    help="model is executed on big endian machine")
+    parser.add_argument("--pad-vocab",   action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")

-    parser = argparse.ArgumentParser(
-        description="Convert a LLaMa model to a GGML compatible file"
-    )
-
-    parser.add_argument(
-        "model",
-        type=Path,
-        help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
-    )
-
-    parser.add_argument(
-        "--awq-path",
-        type=Path,
-        help="Path to the Activation-aware Weight Quantization cache file",
-        default=None,
-    )
-
-    parser.add_argument(
-        "--dump",
-        action="store_true",
-        help="Display the model content without converting it",
-    )
-
-    parser.add_argument(
-        "--dump-single",
-        action="store_true",
-        help="Display the content of a single model file without conversion",
-    )
-
-    parser.add_argument(
-        "--vocab-only",
-        action="store_true",
-        help="Extract and output only the vocabulary",
-    )
-
-    parser.add_argument(
-        "--outtype",
-        choices=output_choices,
-        help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
-    )
-
-    parser.add_argument(
-        "--vocab-dir",
-        type=Path,
-        help="Directory containing the tokenizer.model, if separate from the model file",
-    )
-
-    parser.add_argument(
-        "--vocab-type",
-        choices=["spm", "bpe", "hfft"],  # hfft: Hugging Face Fast Tokenizer
-        default="spm",
-        help="The vocabulary format used to define the tokenizer model (default: spm)",
-    )
-
-    parser.add_argument(
-        "--pad-vocab",
-        action="store_true",
-        help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
-    )
-
-    parser.add_argument(
-        "--outfile",
-        type=Path,
-        help="Specify the path for the output file (default is based on input)",
-    )
-
-    parser.add_argument(
-        "--ctx", type=int, help="Model training context (default is based on input)"
-    )
-
-    parser.add_argument(
-        "--concurrency",
-        type=int,
-        help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
-        default=DEFAULT_CONCURRENCY,
-    )
-
-    parser.add_argument(
-        "--big-endian",
-        action="store_true",
-        help="Indicate that the model is executed on a big-endian machine",
-    )
-
-    return parser
-
-
-def main(argv: Optional[list[str]] = None) -> None:
-    parser = get_argument_parser()
-    args = parser.parse_args(argv)
-
+    args = parser.parse_args(args_in)
    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
-        from awq.apply_awq import add_scale_weights
-
+        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
+        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
        tmp_model_path = args.model / "weighted_model"
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
@@ -1580,14 +1409,11 @@ def main(argv: Optional[list[str]] = None) -> None:
    if not args.vocab_only:
        model_plus = load_some_model(args.model)
    else:
-        model_plus = ModelPlus(
-            model={}, paths=[args.model / "dummy"], format="none", vocab=None
-        )
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

    if args.dump:
        do_dump_model(model_plus)
        return
-
    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG
@@ -1595,12 +1421,10 @@ def main(argv: Optional[list[str]] = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            raise Exception(
-                "The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                "Please specify one with --ctx:\n"
-                " - LLaMA v1: --ctx 2048\n"
-                " - LLaMA v2: --ctx 4096\n"
-            )
+            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                            "Please specify one with --ctx:\n"
+                            " - LLaMA v1: --ctx 2048\n"
+                            " - LLaMA v2: --ctx 4096\n")
        params.n_ctx = args.ctx

    if args.outtype:
@@ -1621,42 +1445,30 @@ def main(argv: Optional[list[str]] = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        OutputFile.write_vocab_only(
-            outfile,
-            params,
-            vocab,
-            special_vocab,
-            endianess=endianess,
-            pad_vocab=args.pad_vocab,
-        )
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
        print(f"Wrote {outfile}")
        return

    if model_plus.vocab is not None and args.vocab_dir is None:
        vocab = model_plus.vocab

-    model = model_plus.model
-    model = convert_model_names(model, params)
-    ftype = pick_output_type(model, args.outtype)
-    model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_output_file(model_plus.paths, ftype)
+    print(f"Vocab info: {vocab}")
+    print(f"Special vocab info: {special_vocab}")
+
+    model   = model_plus.model
+    model   = convert_model_names(model, params)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)

    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(
-        outfile,
-        ftype,
-        params,
-        model,
-        vocab,
-        special_vocab,
-        concurrency=args.concurrency,
-        endianess=endianess,
-        pad_vocab=args.pad_vocab,
-    )
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
    print(f"Wrote {outfile}")


-if __name__ == "__main__":
-    main(sys.argv[1:])  # Exclude the first element (script name) from sys.argv
+if __name__ == '__main__':
+    main()
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,6 +23,9 @@ else()
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
+    if (LLAMA_SYCL)
+        add_subdirectory(sycl)
+    endif()
    add_subdirectory(main)
    add_subdirectory(tokenize)
    add_subdirectory(parallel)
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
+    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
+    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -0,0 +1,32 @@
+# llama.cpp/examples/imatrix
+
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
+
+## Usage
+
+```
+./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
+        [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+```
+
+Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
+The parameters in square brackets are optional and have the following meaning:
+* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
+* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
+* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+
+For faster computation, make sure to use GPU offloading via the `-ngl` argument
+
+## Example
+
+```bash
+LLAMA_CUBLAS=1 make -j
+
+# generate importance matrix (imatrix.dat)
+./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+
+# use the imatrix to perform a Q4_K_M quantization
+./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+```
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -26,6 +26,7 @@ struct StatParams {
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
+    int         keep_every = 0;
    bool        collect_output_weight = false;
 };

@@ -42,6 +43,9 @@ private:
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
+                                                  //
+    void save_imatrix(const char * file_name) const;
+    void keep_imatrix(int ncall) const;
 };

 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -80,7 +84,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        // for simplicity, always copy src0 to host, because it is small
        // take into account that src0 is not contiguous!
        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(src0));
+        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));

@@ -117,6 +121,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                if (m_last_call % m_params.n_output_frequency == 0) {
                    save_imatrix();
                }
+                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                    keep_imatrix(m_last_call);
+                }
            }
        }
    } else {
@@ -143,6 +150,9 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            if (m_last_call % m_params.n_output_frequency == 0) {
                save_imatrix();
            }
+            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
+                keep_imatrix(m_last_call);
+            }
        }
    }

@@ -150,7 +160,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }

 void IMatrixCollector::save_imatrix() const {
-    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+}
+
+void IMatrixCollector::keep_imatrix(int ncall) const {
+    auto file_name = m_params.ofile;
+    if (file_name.empty()) file_name = "imatrix.dat";
+    file_name += ".at_";
+    file_name += std::to_string(ncall);
+    save_imatrix(file_name.c_str());
+}
+
+void IMatrixCollector::save_imatrix(const char * fname) const {
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
    out.write((const char*)&n_entries, sizeof(n_entries));
@@ -248,7 +269,7 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
@@ -269,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    }

    std::vector<float> logit_history;
-    logit_history.resize(tokens.size());
-
    std::vector<float> prob_history;
-    prob_history.resize(tokens.size());
+
+    if (compute_ppl) {
+        logit_history.resize(tokens.size());
+        prob_history.resize(tokens.size());
+    }

    const int n_chunk_max = tokens.size() / n_ctx;

@@ -288,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

+    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+
+    std::vector<float> logits;
+    if (compute_ppl && num_batches > 1) {
+        logits.reserve((size_t)n_ctx * n_vocab);
+    }
+
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

-        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
        std::vector<float> logits;

        const auto t_start = std::chrono::high_resolution_clock::now();
@@ -321,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

-            const auto * batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            if (compute_ppl && num_batches > 1) {
+                const auto * batch_logits = llama_get_logits(ctx);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            }
        }

        const auto t_end = std::chrono::high_resolution_clock::now();
@@ -338,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        const int first = n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        count += n_ctx - first - 1;
+        if (compute_ppl) {
+            const int first = n_ctx/2;
+            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+            count += n_ctx - first - 1;

-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            fflush(stdout);
+
+            logits.clear();
+        }
    }
    printf("\n");

-    nll2 /= count;
-    nll /= count;
-    const double ppl = exp(nll);
-    nll2 -= nll * nll;
-    if (nll2 > 0) {
-        nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-    } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
+    if (compute_ppl) {
+        nll2 /= count;
+        nll /= count;
+        const double ppl = exp(nll);
+        nll2 -= nll * nll;
+        if (nll2 > 0) {
+            nll2 = sqrt(nll2/(count-1));
+            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        } else {
+            printf("Unexpected negative standard deviation of log(prob)\n");
+        }
    }

    return true;
@@ -365,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {

    StatParams sparams;
+    bool compute_ppl = true;
    std::vector<char*> args;
    args.push_back(argv[0]);
    int iarg = 1;
@@ -381,12 +419,21 @@ int main(int argc, char ** argv) {
        }
        else if (arg == "--verbosity") {
            sparams.verbosity = std::stoi(argv[++iarg]);
+        } else if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else if (arg == "--keep-imatrix") {
+            sparams.keep_every = std::stoi(argv[++iarg]);
        } else {
            args.push_back(argv[iarg]);
        }
    }
    if (iarg < argc) {
-        args.push_back(argv[iarg]);
+        std::string arg{argv[iarg]};
+        if (arg == "--no-ppl") {
+            compute_ppl = false;
+        } else {
+            args.push_back(argv[iarg]);
+        }
    }

    gpt_params params;
@@ -448,7 +495,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

-    bool OK = compute_imatrix(ctx, params);
+    bool OK = compute_imatrix(ctx, params, compute_ppl);
    if (!OK) {
        return 1;
    }
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
    LOG("add_bos: %d\n", add_bos);

    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -562,6 +562,8 @@ struct test {
    static const int build_number;
    static const bool cuda;
    static const bool opencl;
+    static const bool vulkan;
+    static const bool kompute;
    static const bool metal;
    static const bool gpu_blas;
    static const bool blas;
@@ -643,6 +645,12 @@ struct test {
        if (opencl) {
            return "OpenCL";
        }
+        if (vulkan) {
+            return "Vulkan";
+        }
+        if (kompute) {
+            return "Kompute";
+        }
        if (metal) {
            return "Metal";
        }
@@ -658,7 +666,7 @@ struct test {
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
            "n_batch", "n_threads", "type_k", "type_v",
@@ -682,8 +690,9 @@ struct test {
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
-        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
+        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
+            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
+            field == "mul_mat_q") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -710,7 +719,8 @@ struct test {
        }
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
+            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@@ -738,6 +748,8 @@ const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cublas();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
+const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
+const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -30,6 +30,7 @@ android {
        }
        externalNativeBuild {
            cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
            }
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -6,7 +6,7 @@
 " Similarly, you could add an insert mode keybind with
 " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
 "
-" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
+" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
 " let g:llama_api_url = "192.168.1.10:8080"
 " llama_overrides can also be set through buffer/window scopes. For instance
 " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@@ -82,6 +82,9 @@ func llama#doLlamaGen()
   endif
   let l:querydata.prompt = join(l:buflines, "\n")
   let l:curlcommand = copy(s:curlcommand)
+   if exists("g:llama_api_key")
+       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
+   endif
   let l:curlcommand[2] = json_encode(l:querydata)
   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
 endfunction
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -0,0 +1,131 @@
+# MobileVLM
+
+Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+
+for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
+
+The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
+
+## Usage
+Build with cmake or run `make llava-cli` to build it.
+
+After building, run: `./llava-cli` to see the usage. For example:
+
+```sh
+./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
+    --image path/to/an/image.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
+```
+
+## Model conversion
+
+- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+
+```sh
+git clone https://huggingface.co/mtgv/MobileVLM-1.7B
+
+git clone https://huggingface.co/openai/clip-vit-large-patch14-336
+```
+
+2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+
+```sh
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
+```
+
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B \
+    --projector-type ldp
+```
+
+4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+
+```sh
+python ./convert.py path/to/MobileVLM-1.7B
+```
+
+5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+```sh
+./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+```
+
+Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
+
+## Android compile and run
+### compile
+refer to `examples/llava/android/build_64.sh`
+```sh
+mkdir examples/llava/android/build_64
+cd examples/llava/android/build_64
+../build_64.sh
+```
+### run on Android
+refer to `android/adb_run.sh`, modify resources' `name` and `path`
+
+## some result on Android with `Snapdragon 888` chip
+### case 1
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/demo.jpg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+```
+**output**
+```sh
+encode_image_with_clip: image encoded in 21148.71 ms by CLIP (  146.87 ms per image patch)
+ Susan Wise Bauer
+llama_print_timings:        load time =   23574.72 ms
+llama_print_timings:      sample time =       1.24 ms /     6 runs   (    0.21 ms per token,  4850.44 tokens per second)
+llama_print_timings: prompt eval time =   12460.15 ms /   246 tokens (   50.65 ms per token,    19.74 tokens per second)
+llama_print_timings:        eval time =     424.86 ms /     6 runs   (   70.81 ms per token,    14.12 tokens per second)
+llama_print_timings:       total time =   34731.93 ms
+```
+### case 2
+**input**
+```sh
+/data/local/tmp/llava-cli \
+    -m /data/local/tmp/ggml-model-q4_k.gguf \
+    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
+    -t 4 \
+    --image /data/local/tmp/cat.jpeg \
+    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+```
+
+**output**
+```sh
+encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
+ The image depicts a cat sitting in the grass near some tall green plants.
+llama_print_timings:        load time =   23257.32 ms
+llama_print_timings:      sample time =       5.25 ms /    18 runs   (    0.29 ms per token,  3430.53 tokens per second)
+llama_print_timings: prompt eval time =   11900.73 ms /   232 tokens (   51.30 ms per token,    19.49 tokens per second)
+llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 ms per token,    14.07 tokens per second)
+llama_print_timings:       total time =   34570.79 ms
+```
+
+## Minor shortcomings
+The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
+
+## TODO
+
+- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
+- [ ] Optimize LDP projector performance
+
+      - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
+      - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
+- [ ] run MobileVLM on `Jetson Orin`
+- [ ] Support more model variants, such as `MobileVLM-3B`.
+
+
+## contributor
+```sh
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
+```
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
+projector_name="mmproj-model-f16.gguf"
+llama_name="ggml-model-q4_k.gguf"
+img_dir="/Users/cxt/model/llm"
+img_name="demo.jpg"
+prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
+# img_name="cat.jpeg"
+# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
+
+program_dir="build_64/bin"
+binName="llava-cli"
+n_threads=4
+
+
+deviceDir="/data/local/tmp"
+saveDir="output"
+if [ ! -d ${saveDir} ]; then
+    mkdir ${saveDir}
+fi
+
+
+function android_run() {
+    # # copy resource into device
+    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
+    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
+    adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
+    # copy program into device
+    adb push ${program_dir}/${binName} ${deviceDir}/${binName}
+    adb shell "chmod 0777 ${deviceDir}/${binName}"
+
+    # run
+    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
+                                                 -m ${deviceDir}/${llama_name} \
+                                                 --mmproj ${deviceDir}/${projector_name} \
+                                                 -t ${n_threads} \
+                                                 --image ${deviceDir}/${img_name} \
+                                                 -p \"${prompt}\" \
+                                                 > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
+    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
+                                                 -m ${deviceDir}/${llama_name} \
+                                                 --mmproj ${deviceDir}/${projector_name} \
+                                                 -t ${n_threads} \
+                                                 --image ${deviceDir}/${img_name} \
+                                                 -p \"${prompt}\" \
+                                                 >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
+    adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
+}
+
+android_run
+
+echo "android_run is Done!"
--- a/examples/llava/android/build_64.sh
+++ b/examples/llava/android/build_64.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+cmake ../../../../ \
+-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
+-DCMAKE_BUILD_TYPE=Release \
+-DANDROID_ABI="arm64-v8a" \
+-DANDROID_PLATFORM=android-23 $1
+
+make -j4
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2,17 +2,6 @@
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it

-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <regex>
-#include <stdexcept>
-#include <vector>
-
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -29,6 +18,19 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <vector>
+#include <sstream>
+#include <cinttypes>
+
 static std::string format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
@@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
+#define KEY_PROJ_TYPE "clip.projector_type"

 //
 // tensor name constants
@@ -89,6 +92,22 @@ static std::string format(const char * fmt, ...) {
 #define TN_TEXT_PROJ "text_projection.weight"
 #define TN_VIS_PROJ "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
+#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
+#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+
+
+enum projector_type {
+    PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
+    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_UNKNOWN,
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+    { PROJECTOR_TYPE_MLP,           "mlp"     },
+    { PROJECTOR_TYPE_LDP,          "ldp"    },
+};
+

 //
 // utilities to get data from a gguf file
@@ -129,6 +148,91 @@ static std::string get_ftype(int ftype) {
    return ggml_type_name(static_cast<ggml_type>(ftype));
 }

+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    std::string result;
+    for (size_t pos = 0; ; pos += search.length()) {
+        auto new_pos = s.find(search, pos);
+        if (new_pos == std::string::npos) {
+            result += s.substr(pos, s.size() - pos);
+            break;
+        }
+        result += s.substr(pos, new_pos - pos) + replace;
+        pos = new_pos;
+    }
+    s = std::move(result);
+}
+
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
+static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
+    size_t tensor_size = ggml_nbytes(tensor);
+    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
+            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
+}
+
+static projector_type clip_projector_type_from_string(const std::string & name) {
+    for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
+        if (kv.second == name) {
+            return kv.first;
+        }
+    }
+    return PROJECTOR_TYPE_UNKNOWN;
+}
+
 //
 // image data
 //
@@ -201,10 +305,44 @@ struct clip_vision_model {
    struct ggml_tensor * projection;

    // LLaVA projection
-    struct ggml_tensor * mm_0_w;
-    struct ggml_tensor * mm_0_b;
-    struct ggml_tensor * mm_2_w;
-    struct ggml_tensor * mm_2_b;
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
+
+    // MobileVLM projection
+    struct ggml_tensor * mm_model_mlp_1_w;
+    struct ggml_tensor * mm_model_mlp_1_b;
+    struct ggml_tensor * mm_model_mlp_3_w;
+    struct ggml_tensor * mm_model_mlp_3_b;
+    struct ggml_tensor * mm_model_block_1_block_0_0_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_w;
+    struct ggml_tensor * mm_model_block_1_block_0_1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_1_block_2_0_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_w;
+    struct ggml_tensor * mm_model_block_1_block_2_1_b;
+    struct ggml_tensor * mm_model_block_2_block_0_0_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_w;
+    struct ggml_tensor * mm_model_block_2_block_0_1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
+    struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
+    struct ggml_tensor * mm_model_block_2_block_2_0_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_w;
+    struct ggml_tensor * mm_model_block_2_block_2_1_b;
 };

 struct clip_ctx {
@@ -213,6 +351,7 @@ struct clip_ctx {
    bool has_llava_projector = false;

    struct clip_vision_model vision_model;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
@@ -330,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    // pre-layernorm
    {
        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");

        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
    }
@@ -430,16 +570,156 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            free(patches_data);
        }

+        // shape [1, 576, 1024]
+        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

-        // mm projection 0
-        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // print_tensor_info(embeddings, "embeddings");

-        embeddings = ggml_gelu(ctx0, embeddings);
+        // llava projector
+        if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

-        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+            // First LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                                model.mm_1_b);
+
+            // GELU activation
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            // Second linear layer
+            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+            // Second LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                                model.mm_4_b);
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projector
+            int n_patch = 24;
+            struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+            mlp_1 = ggml_gelu(ctx0, mlp_1);
+            struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+            // block 1
+            struct ggml_tensor * block_1 = nullptr;
+            {
+                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
+                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                // stride = 1, padding = 1, bias is nullptr
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+                // layer norm
+                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // hardswish
+                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // residual
+                block_1 = ggml_add(ctx0, mlp_3, block_1);
+            }
+
+            // block_2
+            {
+                // stride = 2
+                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // layer norm
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // hardswish
+                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                // not sure the parameters is right for globalAvgPooling
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+            }
+            embeddings = block_1;
+        }
+        else {
+            GGML_ASSERT(false);
+        }
    }

    // build the graph
@@ -485,16 +765,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        printf("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);
+
    // kv
-    if (verbosity >= 3) {
-        const int n_kv = gguf_get_n_kv(ctx);
+    const int n_kv = gguf_get_n_kv(ctx);
+    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+        __func__, n_kv, n_tensors, fname);
+    {
+        std::map<enum ggml_type, uint32_t> n_type;

-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
+        for (int i = 0; i < n_tensors; i++) {
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);

-            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
+            n_type[type]++;
+        }
+
+        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        for (int i = 0; i < n_kv; i++) {
+            const char * name           = gguf_get_key(ctx, i);
+            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
+                : gguf_type_name(type);
+
+            std::string value          = gguf_kv_to_str(ctx, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");
+
+            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        }
+
+        // print type counts
+        for (auto & kv : n_type) {
+            if (kv.second == 0) {
+                continue;
+            }
+
+            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
-        printf("\n");
    }

    // data
@@ -503,12 +814,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
+            enum ggml_type type = gguf_get_tensor_type(ctx, i);
            struct ggml_tensor * cur = ggml_get_tensor(meta, name);
            size_t tensor_size = ggml_nbytes(cur);
            buffer_size += tensor_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, offset);
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
    }
@@ -517,6 +829,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

    clip_ctx * new_clip = new clip_ctx;

+    // update projector type
+    {
+        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
+        if (idx != -1) {
+            const std::string proj_type = gguf_get_val_str(ctx, idx);
+            new_clip->proj_type = clip_projector_type_from_string(proj_type);
+        }
+        else {
+            new_clip->proj_type = PROJECTOR_TYPE_MLP;
+        }
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+            }
+        }
+    }
+
 #ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
@@ -661,10 +990,63 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+
+        // LLaVA projection
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+            vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+            vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+            try {
+                // Yi-type llava
+                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+            } catch (std::runtime_error & e) {  }
+            try {
+                // missing in Yi-type llava
+                vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+                vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+            } catch (std::runtime_error & e) {  }
+            try {
+                // Yi-type llava
+                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+            } catch (std::runtime_error & e) {  }
+            try {
+                // Yi-type llava
+                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+            } catch (std::runtime_error & e) {  }
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projection
+            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
+            vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
+            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
+            vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
+            vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
+            vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
+            vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
+            vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
+            vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
+            vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
+            vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
+            vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
+            vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
+            vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
+            vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
+            vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
+            vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
+            vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
+            vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
+            vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
+            vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
+            vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
+            vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
+            vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
+        }
+        else {
+            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
+            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+        }

        vision_model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -949,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        ".*weight",
    };

-    std::vector<uint8_t> read_data(512);
    std::vector<uint8_t> work(512);
    std::vector<float> conv_buf(512);
    std::vector<int64_t> hist_all(1 << 4, 0);
@@ -1100,13 +1481,27 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 }

 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-    return ctx->vision_model.mm_2_b->ne[0];
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
+    }
+    else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
+        return ctx->vision_model.mm_2_b->ne[0];
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        return ctx->vision_model.mm_3_b->ne[0];
+    }
+    else {
+        std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
+        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
+    }
 }

 int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;
-
-    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
+        n_patches /= 4;
+    }
+    return n_patches;
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
+    # add projector type
+    fout.add_string("clip.projector_type", args.projector_type)
 else:
    fout.add_description("two-tower CLIP model")

@@ -218,7 +221,8 @@ if has_llava_projector:
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
-        if data.ndim == 2:
+        # pw and dw conv ndim==4
+        if data.ndim == 2 or data.ndim == 4:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));

-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

    // generate the response

@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
        if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior

        printf("%s", tmp);
        fflush(stdout);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -39,6 +39,17 @@ static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;

+static bool file_exists(const std::string &path) {
+    std::ifstream f(path.c_str());
+    return f.good();
+}
+
+static bool file_is_empty(const std::string &path) {
+    std::ifstream f;
+    f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+    f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
+    return f.tellg() == 0;
+}

 static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
@@ -215,12 +226,12 @@ int main(int argc, char ** argv) {

    if (!path_session.empty()) {
        LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
-
-        // fopen to check for existing session
-        FILE * fp = std::fopen(path_session.c_str(), "rb");
-        if (fp != NULL) {
-            std::fclose(fp);
-
+        if (!file_exists(path_session)) {
+            LOG_TEE("%s: session file does not exist, will create.\n", __func__);
+        } else if (file_is_empty(path_session)) {
+            LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
+        } else {
+            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
@@ -229,10 +240,7 @@ int main(int argc, char ** argv) {
            }
            session_tokens.resize(n_token_count_out);
            llama_set_rng_seed(ctx, params.seed);
-
-            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
-        } else {
-            LOG_TEE("%s: session file does not exist, will create\n", __func__);
+            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
@@ -1,14 +1,14 @@
 # Function calling example using pydantic models.
 import datetime
+import importlib
 import json
 from enum import Enum
-from typing import Union, Optional
+from typing import Optional, Union

 import requests
 from pydantic import BaseModel, Field
-
-import importlib
-from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
+from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
+                                        create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)


 # Function to get completion on the llama.cpp server with grammar.
@@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
        print(self.message)


-# Enum for the calculator function.
+# Enum for the calculator tool.
 class MathOperation(Enum):
    ADD = "add"
    SUBTRACT = "subtract"
@@ -43,7 +43,7 @@ class MathOperation(Enum):
    DIVIDE = "divide"


-# Very simple calculator tool for the agent.
+# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
 class Calculator(BaseModel):
    """
    Perform a math operation on two numbers.
@@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
    return datetime.datetime.now().strftime(output_format)


-# Enum for the calculator tool.
-class MathOperation(Enum):
-    ADD = "add"
-    SUBTRACT = "subtract"
-    MULTIPLY = "multiply"
-    DIVIDE = "divide"
-
-
-
-# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
-class Calculator(BaseModel):
-    """
-    Perform a math operation on two numbers.
-    """
-    number_one: Union[int, float] = Field(..., description="First number.")
-    operation: MathOperation = Field(..., description="Math operation to perform.")
-    number_two: Union[int, float] = Field(..., description="Second number.")
-
-    def run(self):
-        if self.operation == MathOperation.ADD:
-            return self.number_one + self.number_two
-        elif self.operation == MathOperation.SUBTRACT:
-            return self.number_one - self.number_two
-        elif self.operation == MathOperation.MULTIPLY:
-            return self.number_one * self.number_two
-        elif self.operation == MathOperation.DIVIDE:
-            return self.number_one / self.number_two
-        else:
-            raise ValueError("Unknown operation.")
-
-
 # Example function to get the weather
 def get_current_weather(location, unit):
    """Get the current weather in a given location"""
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -1,15 +1,21 @@
+from __future__ import annotations
+
 import inspect
 import json
+import re
 from copy import copy
-from inspect import isclass, getdoc
-from types import NoneType
+from enum import Enum
+from inspect import getdoc, isclass
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints

 from docstring_parser import parse
-from pydantic import BaseModel, create_model, Field
-from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
-from enum import Enum
-from typing import get_type_hints, Callable
-import re
+from pydantic import BaseModel, Field, create_model
+
+if TYPE_CHECKING:
+    from types import GenericAlias
+else:
+    # python 3.8 compat
+    from typing import _GenericAlias as GenericAlias


 class PydanticDataType(Enum):
@@ -43,7 +49,7 @@ class PydanticDataType(Enum):
    SET = "set"


-def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
+def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
    if isclass(pydantic_type) and issubclass(pydantic_type, str):
        return PydanticDataType.STRING.value
    elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
@@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:

    elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
        return format_model_and_field_name(pydantic_type.__name__)
-    elif get_origin(pydantic_type) == list:
+    elif get_origin(pydantic_type) is list:
        element_type = get_args(pydantic_type)[0]
        return f"{map_pydantic_type_to_gbnf(element_type)}-list"
-    elif get_origin(pydantic_type) == set:
+    elif get_origin(pydantic_type) is set:
        element_type = get_args(pydantic_type)[0]
        return f"{map_pydantic_type_to_gbnf(element_type)}-set"
-    elif get_origin(pydantic_type) == Union:
+    elif get_origin(pydantic_type) is Union:
        union_types = get_args(pydantic_type)
        union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
        return f"union-{'-or-'.join(union_rules)}"
-    elif get_origin(pydantic_type) == Optional:
+    elif get_origin(pydantic_type) is Optional:
        element_type = get_args(pydantic_type)[0]
        return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
    elif isclass(pydantic_type):
        return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
-    elif get_origin(pydantic_type) == dict:
+    elif get_origin(pydantic_type) is dict:
        key_type, value_type = get_args(pydantic_type)
        return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
    else:
@@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
        return f"{cls.__name__.lower()} ::= " + " | ".join(members)
    if cls.__annotations__ and cls.__annotations__ != {}:
        result = f'{rule_name} ::= "{{"'
-        type_list_rules = []
        # Modify this comprehension
        members = [
            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
@@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name):

        result += '"," '.join(members)
        result += '  "}"'
-        return result, type_list_rules
-    elif rule_name == "custom-class-any":
+        return result
+    if rule_name == "custom-class-any":
        result = f"{rule_name} ::= "
        result += "value"
-        type_list_rules = []
-        return result, type_list_rules
-    else:
-        init_signature = inspect.signature(cls.__init__)
-        parameters = init_signature.parameters
-        result = f'{rule_name} ::=  "{{"'
-        type_list_rules = []
-        # Modify this comprehension too
-        members = [
-            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
-            for name, param in parameters.items()
-            if name != "self" and param.annotation != inspect.Parameter.empty
-        ]
+        return result

-        result += '", "'.join(members)
-        result += '  "}"'
-        return result, type_list_rules
+    init_signature = inspect.signature(cls.__init__)
+    parameters = init_signature.parameters
+    result = f'{rule_name} ::=  "{{"'
+    # Modify this comprehension too
+    members = [
+        f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
+        for name, param in parameters.items()
+        if name != "self" and param.annotation != inspect.Parameter.empty
+    ]
+
+    result += '", "'.join(members)
+    result += '  "}"'
+    return result


 def regex_to_gbnf(regex_pattern: str) -> str:
@@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None

 def generate_gbnf_rule_for_type(
    model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
-) -> Tuple[str, list]:
+) -> tuple[str, list[str]]:
    """
    Generate GBNF rule for a given field type.

@@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
    :param field_info: Additional information about the field (optional).

    :return: Tuple containing the GBNF type and a list of additional rules.
-    :rtype: Tuple[str, list]
+    :rtype: tuple[str, list]
    """
    rules = []

@@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
        gbnf_type, rules = model_name + "-" + field_name, rules

    elif gbnf_type.startswith("custom-class-"):
-        nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
-        rules.append(nested_model_rules)
+        rules.append(get_members_structure(field_type, gbnf_type))
    elif gbnf_type.startswith("custom-dict-"):
        key_type, value_type = get_args(field_type)

@@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
        union_rules = []

        for union_type in union_types:
-            if isinstance(union_type, _GenericAlias):
+            if isinstance(union_type, GenericAlias):
                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
                    model_name, field_name, union_type, False, processed_models, created_rules
                )
                union_rules.append(union_gbnf_type)
                rules.extend(union_rules_list)

-            elif not issubclass(union_type, NoneType):
+            elif not issubclass(union_type, type(None)):
                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
                    model_name, field_name, union_type, False, processed_models, created_rules
                )
@@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
    else:
        gbnf_type, rules = gbnf_type, []

-    if gbnf_type not in created_rules:
-        return gbnf_type, rules
-    else:
-        if gbnf_type in created_rules:
-            return gbnf_type, rules
+    return gbnf_type, rules


-def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
+def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
    """

    Generate GBnF Grammar
@@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
    ```
    """
    if model in processed_models:
-        return []
+        return [], False

    processed_models.add(model)
    model_name = format_model_and_field_name(model.__name__)
@@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created


 def generate_gbnf_grammar_from_pydantic_models(
-    models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
+    models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
    list_of_outputs: bool = False
 ) -> str:
    """
@@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
    * grammar.

    Args:
-        models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+        models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
        list_of_outputs (str, optional): Allows a list of output objects
@@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
        # root ::= UserModel | PostModel
        # ...
    """
-    processed_models = set()
+    processed_models: set[type[BaseModel]] = set()
    all_rules = []
-    created_rules = {}
+    created_rules: dict[str, list[str]] = {}
    if outer_object_name is None:
        for model in models:
            model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
@@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
    Returns:
        str: GBNF primitive grammar string.
    """
-    type_list = []
+    type_list: list[type[object]] = []
    if "string-list" in grammar:
        type_list.append(str)
    if "boolean-list" in grammar:
@@ -666,14 +664,14 @@ triple-quotes ::= "'''" """


 def generate_markdown_documentation(
-    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
    documentation_with_field_description=True
 ) -> str:
    """
    Generate markdown documentation for a list of Pydantic models.

    Args:
-        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
        model_prefix (str): Prefix for the model section.
        fields_prefix (str): Prefix for the fields section.
        documentation_with_field_description (bool): Include field descriptions in the documentation.
@@ -731,7 +729,7 @@ def generate_markdown_documentation(


 def generate_field_markdown(
-    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
    documentation_with_field_description=True
 ) -> str:
    """
@@ -739,8 +737,8 @@ def generate_field_markdown(

    Args:
        field_name (str): Name of the field.
-        field_type (Type[Any]): Type of the field.
-        model (Type[BaseModel]): Pydantic model class.
+        field_type (type[Any]): Type of the field.
+        model (type[BaseModel]): Pydantic model class.
        depth (int): Indentation depth in the documentation.
        documentation_with_field_description (bool): Include field descriptions in the documentation.

@@ -798,7 +796,7 @@ def generate_field_markdown(
    return field_text


-def format_json_example(example: dict, depth: int) -> str:
+def format_json_example(example: dict[str, Any], depth: int) -> str:
    """
    Format a JSON example into a readable string with indentation.

@@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:


 def generate_text_documentation(
-    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
    documentation_with_field_description=True
 ) -> str:
    """
    Generate text documentation for a list of Pydantic models.

    Args:
-        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
        model_prefix (str): Prefix for the model section.
        fields_prefix (str): Prefix for the fields section.
        documentation_with_field_description (bool): Include field descriptions in the documentation.
@@ -885,7 +883,7 @@ def generate_text_documentation(


 def generate_field_text(
-    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
    documentation_with_field_description=True
 ) -> str:
    """
@@ -893,8 +891,8 @@ def generate_field_text(

    Args:
        field_name (str): Name of the field.
-        field_type (Type[Any]): Type of the field.
-        model (Type[BaseModel]): Pydantic model class.
+        field_type (type[Any]): Type of the field.
+        model (type[BaseModel]): Pydantic model class.
        depth (int): Indentation depth in the documentation.
        documentation_with_field_description (bool): Include field descriptions in the documentation.

@@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
    pydantic_model_list,
    grammar_file_path="./generated_grammar.gbnf",
    documentation_file_path="./generated_grammar_documentation.md",
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
    model_prefix: str = "Output Model",
    fields_prefix: str = "Output Fields",
    list_of_outputs: bool = False,
@@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(

 def generate_gbnf_grammar_and_documentation(
    pydantic_model_list,
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
    model_prefix: str = "Output Model",
    fields_prefix: str = "Output Fields",
    list_of_outputs: bool = False,
@@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(


 def generate_gbnf_grammar_and_documentation_from_dictionaries(
-    dictionaries: List[dict],
-    outer_object_name: str = None,
-    outer_object_content: str = None,
+    dictionaries: list[dict[str, Any]],
+    outer_object_name: str | None = None,
+    outer_object_content: str | None = None,
    model_prefix: str = "Output Model",
    fields_prefix: str = "Output Fields",
    list_of_outputs: bool = False,
@@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
    Generate GBNF grammar and documentation from a list of dictionaries.

    Args:
-        dictionaries (List[dict]): List of dictionaries representing Pydantic models.
+        dictionaries (list[dict]): List of dictionaries representing Pydantic models.
        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
        model_prefix (str): Prefix for the model section in the documentation.
@@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
    return grammar, documentation


-def create_dynamic_model_from_function(func: Callable):
+def create_dynamic_model_from_function(func: Callable[..., Any]):
    """
    Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.

@@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
    sig = inspect.signature(func)

    # Parse the docstring
+    assert func.__doc__ is not None
    docstring = parse(func.__doc__)

    dynamic_fields = {}
@@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
                f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")

        # Add parameter details to the schema
-        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
        param_docs.append((param.name, param_doc))
        if param.default == inspect.Parameter.empty:
            default_value = ...
@@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
        dynamic_fields[param.name] = (
            param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
    # Creating the dynamic model
-    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
+    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)  # type: ignore[call-overload]

-    for param_doc in param_docs:
-        dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
+    for name, param_doc in param_docs:
+        dynamic_model.model_fields[name].description = param_doc.description

    dynamic_model.__doc__ = docstring.short_description

@@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
    return dynamic_model


-def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
+def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
    """
    Add a 'run' method to a dynamic Pydantic model, using the provided function.

    Args:
-        model (Type[BaseModel]): Dynamic Pydantic model class.
+        model (type[BaseModel]): Dynamic Pydantic model class.
        func (Callable): Function to be added as a 'run' method to the model.

    Returns:
-        Type[BaseModel]: Pydantic model class with the added 'run' method.
+        type[BaseModel]: Pydantic model class with the added 'run' method.
    """

    def run_method_wrapper(self):
@@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
    return model


-def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
+def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
    """
    Create a list of dynamic Pydantic model classes from a list of dictionaries.

    Args:
-        dictionaries (List[dict]): List of dictionaries representing model structures.
+        dictionaries (list[dict]): List of dictionaries representing model structures.

    Returns:
-        List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
+        list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
    """
    dynamic_models = []
    for func in dictionaries:
@@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
    return Enum(enum_name, {value: value for value in values})


-def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
+def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
    """
    Convert a dictionary to a Pydantic model class.

@@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
        model_name (str): Name of the generated Pydantic model.

    Returns:
-        Type[BaseModel]: Generated Pydantic model class.
+        type[BaseModel]: Generated Pydantic model class.
    """
-    fields = {}
+    fields: dict[str, Any] = {}

    if "properties" in dictionary:
        for field_name, field_data in dictionary.get("properties", {}).items():
@@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
                    if items != {}:
                        array = {"properties": items}
                        array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
-                        fields[field_name] = (List[array_type], ...)
+                        fields[field_name] = (List[array_type], ...)  # type: ignore[valid-type]
                    else:
                        fields[field_name] = (list, ...)
                elif field_type == "object":
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -378,6 +378,8 @@ int main(int argc, char ** argv) {
                printf("testing %s ...\n",  ggml_type_name(type));
            }

+            ggml_quantize_init(type);
+
            error_stats global_stats {};

            for (const auto& kv_tensor : tensors) {
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -25,7 +25,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
+    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization"   , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
@@ -35,7 +37,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -4,32 +4,34 @@ This example demonstrates a simple HTTP API server and a simple web front end to

 Command line options:

-   `--threads N`, `-t N`: Set the number of threads to use during generation.
-   `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-   `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
-   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
-   `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
-   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
-   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-   `--numa`: Attempt optimizations that help on some NUMA systems.
-   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
-   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
-   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
-   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
-   `--port`: Set the port to listen. Default: `8080`.
-   `--path`: path from which to serve static files (default examples/server/public)
-   `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
-   `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
-   `--embedding`: Enable embedding extraction, Default: disabled.
-   `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
-   `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
-   `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
-   `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
+- `--threads N`, `-t N`: Set the number of threads to use during generation.
+- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
+- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
+- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+- `--numa`: Attempt optimizations that help on some NUMA systems.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
+- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
+- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
+- `--port`: Set the port to listen. Default: `8080`.
+- `--path`: path from which to serve static files (default examples/server/public)
+- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
+- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
+- `--embedding`: Enable embedding extraction, Default: disabled.
+- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
+- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
+- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
+- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`

 ## Build

@@ -51,20 +53,30 @@ server is build alongside everything else from the root of the project

 To get started right away, run the following command, making sure to use the correct path for the model you have:

-### Unix-based systems (Linux, macOS, etc.):
+### Unix-based systems (Linux, macOS, etc.)

 ```bash
 ./server -m models/7B/ggml-model.gguf -c 2048
 ```

-### Windows:
+### Windows

 ```powershell
 server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
+
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

+### Docker
+
+```bash
+docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+
+# or, with CUDA:
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+```
+
 ## Testing with CURL

 Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
@@ -111,12 +123,13 @@ node index.js
 ```

 ## API Endpoints
- **GET** `/health`: Returns the current state of the server:
-    - `{"status": "loading model"}` if the model is still being loaded.
-    - `{"status": "error"}` if the model failed to load.
-    - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.

-   **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
+- **GET** `/health`: Returns the current state of the server:
+  - `{"status": "loading model"}` if the model is still being loaded.
+  - `{"status": "error"}` if the model failed to load.
+  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+
+- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.

    *Options:*

@@ -180,14 +193,13 @@ node index.js

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

-### Result JSON:
-
-* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
+### Result JSON

+- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.

 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:

-```
+```json
 {
  "content": "<the token selected by the model>",
  "probs": [
@@ -203,6 +215,7 @@ node index.js
  ]
 },
 ```
+
 Notice that each `probs` is an array of length `n_probs`.

 - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
@@ -219,7 +232,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `tokens_evaluated`: Number of tokens evaluated in total from the prompt
 - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

-   **POST** `/tokenize`: Tokenize a given text.
+- **POST** `/tokenize`: Tokenize a given text.

    *Options:*

@@ -227,13 +240,13 @@ Notice that each `probs` is an array of length `n_probs`.

    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.

-   **POST** `/detokenize`: Convert tokens to text.
+- **POST** `/detokenize`: Convert tokens to text.

    *Options:*

    `tokens`: Set the tokens to detokenize.

-   **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
+- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.

    *Options:*

@@ -241,7 +254,7 @@ Notice that each `probs` is an array of length `n_probs`.

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-   **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.

    *Options:*

@@ -251,9 +264,9 @@ Notice that each `probs` is an array of length `n_probs`.

    It also accepts all the options of `/completion` except `stream` and `prompt`.

-   **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.

-   **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.

    *Options:*

@@ -281,6 +294,7 @@ Notice that each `probs` is an array of length `n_probs`.

    print(completion.choices[0].message)
    ```
+
    ... or raw HTTP requests:

    ```shell
@@ -302,6 +316,40 @@ Notice that each `probs` is an array of length `n_probs`.
    }'
    ```

+- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
+
+    *Options:*
+
+    See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
+
+    *Examples:*
+
+  - input as string
+
+    ```shell
+    curl http://localhost:8080/v1/embeddings \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+            "input": "hello",
+            "model":"GPT-4",
+            "encoding_format": "float"
+    }'
+    ```
+
+  - `input` as string array
+
+    ```shell
+    curl http://localhost:8080/v1/embeddings \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+            "input": ["hello", "world"],
+            "model":"GPT-4",
+            "encoding_format": "float"
+    }'
+    ```
+
 ## More examples

 ### Change system prompt on runtime
@@ -353,6 +401,7 @@ python api_like_OAI.py
 ```

 After running the API server, you can use it in Python by setting the API base URL.
+
 ```python
 openai.api_base = "http://<Your api-server IP>:port"
 ```
--- a/examples/server/chat.sh
+++ b/examples/server/chat.sh
@@ -48,6 +48,7 @@ chat_completion() {
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 256,
+        cache_prompt: true,
        stop: ["\n### Human:"],
        stream: true
    }')"
--- a/examples/server/oai.hpp
+++ b/examples/server/oai.hpp
@@ -0,0 +1,223 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+#include "utils.hpp"
+
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+
+using json = nlohmann::json;
+
+inline static json oaicompat_completion_params_parse(
+    const json &body /* openai api json semantics */)
+{
+    json llama_params;
+
+    llama_params["__oaicompat"] = true;
+
+    // Map OpenAI parameters to llama.cpp parameters
+    //
+    // For parameters that are defined by the OpenAI documentation (e.g.
+    // temperature), we explicitly specify OpenAI's intended default; we
+    // need to do that because sometimes OpenAI disagrees with llama.cpp
+    //
+    // https://platform.openai.com/docs/api-reference/chat/create
+    llama_sampling_params default_sparams;
+    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
+    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
+    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
+    llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
+    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
+    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
+    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
+    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
+    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
+    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
+    llama_params["stream"]            = json_value(body, "stream", false);
+    llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+    llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
+    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
+    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);
+
+    if (body.count("grammar") != 0) {
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
+    }
+
+    // Handle 'stop' field
+    if (body.contains("stop") && body["stop"].is_string()) {
+        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Ensure there is ChatML-specific end sequence among stop words
+    llama_params["stop"].push_back("<|im_end|>");
+
+    return llama_params;
+}
+
+inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
+{
+    json result = response.result_json;
+
+    bool stopped_word        = result.count("stopped_word") != 0;
+    bool stopped_eos         = json_value(result, "stopped_eos", false);
+    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+    std::string content      = json_value(result, "content", std::string(""));
+
+    std::string finish_reason = "length";
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+
+    json choices =
+        streaming ? json::array({json{{"finish_reason", finish_reason},
+                                        {"index", 0},
+                                        {"delta", json::object()}}})
+                  : json::array({json{{"finish_reason", finish_reason},
+                                        {"index", 0},
+                                        {"message", json{{"content", content},
+                                                         {"role", "assistant"}}}}});
+
+    std::time_t t = std::time(0);
+
+    json res =
+        json{{"choices", choices},
+            {"created", t},
+            {"model",
+                json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+            {"usage",
+                json{{"completion_tokens", num_tokens_predicted},
+                     {"prompt_tokens",     num_prompt_tokens},
+                     {"total_tokens",      num_tokens_predicted + num_prompt_tokens}}},
+            {"id", gen_chatcmplid()}};
+
+    if (server_verbose) {
+        res["__verbose"] = result;
+    }
+
+    if (result.contains("completion_probabilities")) {
+        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+    }
+
+    return res;
+}
+
+// return value is vector as there is one case where we might need to generate two responses
+inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
+    json result = response.result_json;
+
+    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
+        return std::vector<json>({response.result_json});
+    }
+
+    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
+    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+
+    bool stopped_word   = json_value(result, "stopped_word", false);
+    bool stopped_eos    = json_value(result, "stopped_eos", false);
+    bool stopped_limit  = json_value(result, "stopped_limit", false);
+    std::string content = json_value(result, "content", std::string(""));
+
+    std::string finish_reason;
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+    if (stopped_limit) {
+        finish_reason = "length";
+    }
+
+    std::time_t t = std::time(0);
+
+    json choices;
+
+    if (!finish_reason.empty()) {
+        choices = json::array({json{{"finish_reason", finish_reason},
+                                    {"index", 0},
+                                    {"delta", json::object()}}});
+    } else {
+        if (first) {
+            if (content.empty()) {
+                choices = json::array({json{{"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{{"role", "assistant"}}}}});
+            } else {
+                // We have to send this as two updates to conform to openai behavior
+                json initial_ret = json{{"choices", json::array({json{
+                                        {"finish_reason", nullptr},
+                                        {"index", 0},
+                                        {"delta", json{
+                                            {"role", "assistant"}
+                                        }}}})},
+                            {"created", t},
+                            {"id", gen_chatcmplid()},
+                            {"model", modelname},
+                            {"object", "chat.completion.chunk"}};
+
+                json second_ret = json{
+                            {"choices", json::array({json{{"finish_reason", nullptr},
+                                                            {"index", 0},
+                                                            {"delta", json{
+                                                            {"content", content}}}
+                                                            }})},
+                            {"created", t},
+                            {"id", gen_chatcmplid()},
+                            {"model", modelname},
+                            {"object", "chat.completion.chunk"}};
+
+                return std::vector<json>({initial_ret, second_ret});
+            }
+        } else {
+            // Some idiosyncrasy in task processing logic makes several trailing calls
+            // with empty content, we ignore these at the calee site.
+            if (content.empty()) {
+                return std::vector<json>({json::object()});
+            }
+
+            choices = json::array({json{
+                {"finish_reason", nullptr},
+                {"index", 0},
+                {"delta",
+                json{
+                    {"content", content},
+                }},
+            }});
+        }
+    }
+
+    json ret = json{{"choices", choices},
+                    {"created", t},
+                    {"id", gen_chatcmplid()},
+                    {"model", modelname},
+                    {"object", "chat.completion.chunk"}};
+
+    return std::vector<json>({ret});
+}
+
+inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
+{
+    json res =
+        json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage",
+                json{{"prompt_tokens", 0},
+                     {"total_tokens", 0}}},
+            {"data", embeddings}
+        };
+    return res;
+}
+
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -0,0 +1,508 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+
+#include "../llava/clip.h"
+
+using json = nlohmann::json;
+
+extern bool server_verbose;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+    SERVER_STATE_ERROR           // An error occurred, load_model failed
+};
+
+enum task_type {
+    TASK_TYPE_COMPLETION,
+    TASK_TYPE_CANCEL,
+    TASK_TYPE_NEXT_RESPONSE
+};
+
+struct task_server {
+    int id = -1; // to be filled by llama_server_queue
+    int target_id;
+    task_type type;
+    json data;
+    bool infill_mode = false;
+    bool embedding_mode = false;
+    int multitask_id = -1;
+};
+
+struct task_result {
+    int id;
+    int multitask_id = -1;
+    bool stop;
+    bool error;
+    json result_json;
+};
+
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+    std::string text_to_send;
+};
+
+static inline void server_log(const char *level, const char *function, int line,
+                       const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
+        {"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
+    };
+
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
+    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+
+//
+// server utils
+//
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role",    std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
+
+//
+// work queue utils
+//
+
+struct llama_server_queue {
+    int id = 0;
+    std::mutex mutex_tasks;
+    // queues
+    std::vector<task_server> queue_tasks;
+    std::vector<task_server> queue_tasks_deferred;
+    std::vector<task_multi> queue_multitasks;
+    std::condition_variable condition_tasks;
+    // callback functions
+    std::function<void(task_server&)> callback_new_task;
+    std::function<void(task_multi&)> callback_finish_multitask;
+    std::function<void(void)> callback_all_task_finished;
+
+    // Add a new task to the end of the queue
+    int post(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        queue_tasks.push_back(std::move(task));
+        condition_tasks.notify_one();
+        return task.id;
+    }
+
+    // Add a new task, but defer until one slot is available
+    void defer(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        queue_tasks_deferred.push_back(std::move(task));
+    }
+
+    // Get the next id for creating anew task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return id++;
+    }
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(task_server&)> callback) {
+        callback_new_task = callback;
+    }
+
+    // Register function to process a multitask
+    void on_finish_multitask(std::function<void(task_multi&)> callback) {
+        callback_finish_multitask = callback;
+    }
+
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
+    }
+
+    // Call when the state of one slot is changed
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
+        }
+        queue_tasks_deferred.clear();
+    }
+
+    // Start the main loop. This call is blocking
+    [[noreturn]]
+    void start_loop() {
+        while (true) {
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
+            {
+                while (true)
+                {
+                    std::unique_lock<std::mutex> lock(mutex_tasks);
+                    if (queue_tasks.empty()) {
+                        lock.unlock();
+                        break;
+                    }
+                    task_server task = queue_tasks.front();
+                    queue_tasks.erase(queue_tasks.begin());
+                    lock.unlock();
+                    LOG_VERBOSE("callback_new_task", {});
+                    callback_new_task(task);
+                }
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks
+                auto queue_iterator = queue_multitasks.begin();
+                while (queue_iterator != queue_multitasks.end())
+                {
+                    if (queue_iterator->subtasks_remaining.empty())
+                    {
+                        // all subtasks done == multitask is done
+                        task_multi current_multitask = *queue_iterator;
+                        callback_finish_multitask(current_multitask);
+                        // remove this multitask
+                        queue_iterator = queue_multitasks.erase(queue_iterator);
+                    }
+                    else
+                    {
+                        ++queue_iterator;
+                    }
+                }
+                // all tasks in the current loop is finished
+                callback_all_task_finished();
+            }
+            LOG_VERBOSE("wait for new task", {});
+            // wait for new task
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    condition_tasks.wait(lock, [&]{
+                        return !queue_tasks.empty();
+                    });
+                }
+            }
+        }
+    }
+
+    //
+    // functions to manage multitasks
+    //
+
+    // add a multitask by specifying the id of all subtask (subtask is a task_server)
+    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = multitask_id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    // updatethe remaining subtasks, while appending results to multitask
+    void update_multitask(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+};
+
+struct llama_server_response {
+    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
+    callback_multitask_t callback_update_multitask;
+    // for keeping track of all tasks waiting for the result
+    std::set<int> waiting_task_ids;
+    // the main result queue
+    std::vector<task_result> queue_results;
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+    void add_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(task_id);
+    }
+
+    void remove_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(task_id);
+    }
+
+    // This function blocks the thread until there is a response for this task_id
+    task_result recv(int task_id) {
+        while (true)
+        {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
+            LOG_VERBOSE("condition_results unblock", {});
+
+            for (int i = 0; i < (int) queue_results.size(); i++)
+            {
+                if (queue_results[i].id == task_id)
+                {
+                    assert(queue_results[i].multitask_id == -1);
+                    task_result res = queue_results[i];
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
+    // Register the function to update multitask
+    void on_multitask_update(callback_multitask_t callback) {
+        callback_update_multitask = callback;
+    }
+
+    // Send a new result to a waiting task_id
+    void send(task_result result) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        LOG_VERBOSE("send new result", {});
+        for (auto& task_id : waiting_task_ids) {
+            // LOG_TEE("waiting task id %i \n", task_id);
+            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+            if (result.multitask_id == task_id)
+            {
+                LOG_VERBOSE("callback_update_multitask", {});
+                callback_update_multitask(task_id, result.id, result);
+                continue;
+            }
+
+            if (result.id == task_id)
+            {
+                LOG_VERBOSE("queue_results.push_back", {});
+                queue_results.push_back(result);
+                condition_results.notify_one();
+                return;
+            }
+        }
+    }
+};
+
+//
+// base64 utils (TODO: move to common in the future)
+//
+
+static const std::string base64_chars =
+             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+             "abcdefghijklmnopqrstuvwxyz"
+             "0123456789+/";
+
+static inline bool is_base64(uint8_t c)
+{
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+{
+    int i = 0;
+    int j = 0;
+    int in_ = 0;
+
+    int in_len = encoded_string.size();
+
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
+
+    std::vector<uint8_t> ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    {
+        char_array_4[i++] = encoded_string[in_]; in_++;
+        if (i == 4)
+        {
+            for (i = 0; i <4; i++)
+            {
+                char_array_4[i] = base64_chars.find(char_array_4[i]);
+            }
+
+            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+            for (i = 0; (i < 3); i++)
+            {
+                ret.push_back(char_array_3[i]);
+            }
+            i = 0;
+        }
+    }
+
+    if (i)
+    {
+        for (j = i; j <4; j++)
+        {
+            char_array_4[j] = 0;
+        }
+
+        for (j = 0; j <4; j++)
+        {
+            char_array_4[j] = base64_chars.find(char_array_4[j]);
+        }
+
+        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+        for (j = 0; (j < i - 1); j++)
+        {
+            ret.push_back(char_array_3[j]);
+        }
+    }
+
+    return ret;
+}
+
+//
+// random string / id
+//
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
+}
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -0,0 +1,9 @@
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+set(TARGET ls-sycl-device)
+add_executable(${TARGET} ls-sycl-device.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -0,0 +1,47 @@
+# llama.cpp/example/sycl
+
+This example program provide the tools for llama.cpp for SYCL on Intel GPU.
+
+## Tool
+
+|Tool Name| Function|Status|
+|-|-|-|
+|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
+
+### ls-sycl-device
+
+List all SYCL devices with ID, compute capability, max work group size, ect.
+
+1. Build the llama.cpp for SYCL for all targets.
+
+2. Enable oneAPI running environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. Execute
+
+```
+./build/bin/ls-sycl-device
+```
+
+Check the ID in startup log, like:
+
+```
+found 4 SYCL devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
+    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
+    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero running time, recommended |
+|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -0,0 +1,20 @@
+
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -0,0 +1,11 @@
+/*MIT license
+  Copyright (C) 2024 Intel Corporation
+  SPDX-License-Identifier: MIT
+*/
+
+#include "ggml-sycl.h"
+
+int main(int argc, char ** argv) {
+    ggml_backend_sycl_print_sycl_devices();
+    return 0;
+}
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+source /opt/intel/oneapi/setvars.sh
+
+if [ $# -gt 0 ]; then
+    export GGML_SYCL_DEVICE=$1
+else
+    export GGML_SYCL_DEVICE=0
+fi
+echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+#export GGML_SYCL_DEBUG=1
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -263,7 +263,6 @@ static void init_model(struct my_llama_model * model) {
    model->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
    alloc_model(alloc, model);
-    ggml_allocr_free(alloc);
 }

 static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
@@ -1102,7 +1101,6 @@ int main(int argc, char ** argv) {
    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
    ggml_allocr_alloc(alloc, tokens_input);
    ggml_allocr_alloc(alloc, target_probs);
-    ggml_allocr_free(alloc);

    // context for compute tensors without their data
    const size_t estimated_compute_size_wo_data = (
@@ -1149,7 +1147,6 @@ int main(int argc, char ** argv) {
            best_compute_size = max_compute_size;
            best_order = gf->order;
        }
-        ggml_allocr_free(alloc);
        ggml_free(ctx_compute);
    }
    size_t max_compute_size = best_compute_size;
@@ -1177,7 +1174,6 @@ int main(int argc, char ** argv) {
        params.common.use_flash,
        params.common.use_checkpointing
    );
-    ggml_allocr_free(alloc);

    std::vector<llama_token> train_tokens;
    std::vector<size_t> train_samples_begin;
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1705133751,
-        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
+        "lastModified": 1706191920,
+        "narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
+        "rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -1,3 +1,17 @@
+# The flake interface to llama.cpp's Nix expressions. The flake is used as a
+# more discoverable entry-point, as well as a way to pin the dependencies and
+# expose default outputs, including the outputs built by the CI.
+
+# For more serious applications involving some kind of customization  you may
+# want to consider consuming the overlay, or instantiating `llamaPackages`
+# directly:
+#
+# ```nix
+# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
+# ```
+
+# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
+# of the relation between Nix and the Nix Flakes.
 {
  description = "Port of Facebook's LLaMA model in C/C++";

--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                    __func__, tensor->name, size, max_avail);
            GGML_ASSERT(!"not enough space in the buffer");
            return;
        }
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
 }

 size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-    return alloc->max_size;
+    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+    // to avoid this, we add a 10% margin to the buffer size
+    return alloc->max_size + alloc->max_size/10;
 }

 // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }

 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);

-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
    if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-        return NULL;
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(*buffers);
+        return false;
    }

    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data == NULL) {
            if (t->view_src == NULL) {
                ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

    ggml_tallocr_free(tallocr);

+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
    return buffer;
 }

--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -19,6 +19,7 @@ extern "C" {
        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
        // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
    // do not use directly, use ggml_backend_tensor_copy instead
    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
    //
    // Backend
    //
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -27,10 +27,20 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_alignment(buft);
 }

+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+    // get_max_size is optional, defaults to SIZE_MAX
+    if (buft->iface.get_max_size) {
+        return buft->iface.get_max_size(buft);
+    }
+    return SIZE_MAX;
+}
+
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
    // get_alloc_size is optional, defaults to ggml_nbytes
    if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
    }
    return ggml_nbytes(tensor);
 }
@@ -55,8 +65,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
               size_t                          size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));

-    GGML_ASSERT(iface.get_base != NULL);
-
    (*buffer) = (struct ggml_backend_buffer) {
        /* .interface = */ iface,
        /* .buft      = */ buft,
@@ -106,6 +114,10 @@ size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
    return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
 }

+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
 }
@@ -120,6 +132,11 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {

 void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
    buffer->usage = usage;
+
+    // FIXME: add a generic callback to the buffer interface
+    if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+        ggml_backend_multi_buffer_set_usage(buffer, usage);
+    }
 }

 ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
@@ -169,6 +186,10 @@ size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
 }

+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+    return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
 void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
@@ -337,11 +358,26 @@ GGML_CALL static void ggml_backend_registry_init(void) {
    ggml_backend_cuda_reg_devices();
 #endif

+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
    ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
+
+#ifdef GGML_USE_VULKAN
+    extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+    ggml_backend_vk_reg_devices();
+#endif
+
+#ifdef GGML_USE_KOMPUTE
+    extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
+    ggml_backend_kompute_reg_devices();
+#endif
 }

 GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
@@ -545,6 +581,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
            /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -600,6 +637,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
            /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
@@ -756,6 +794,80 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, v
    GGML_UNUSED(user_data);
 }

+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+    ggml_backend_buffer_t * buffers;
+    size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+    return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_free(ctx->buffers[i]);
+    }
+
+    free(ctx->buffers);
+    free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_clear(ctx->buffers[i], value);
+    }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+    static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+        /* .get_name        = */ ggml_backend_multi_buffer_get_name,
+        /* .free_buffer     = */ ggml_backend_multi_buffer_free_buffer,
+        /* .get_base        = */ NULL,
+        /* .init_tensor     = */ NULL,
+        /* .set_tensor      = */ NULL,
+        /* .get_tensor      = */ NULL,
+        /* .cpy_tensor      = */ NULL,
+        /* .clear           = */ ggml_backend_multi_buffer_clear,
+        /* .reset           = */ NULL,
+    };
+
+    return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+    ctx->n_buffers = n_buffers;
+    ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+    size_t total_size = 0;
+    for (size_t i = 0; i < n_buffers; i++) {
+        ctx->buffers[i] = buffers[i];
+        total_size += ggml_backend_buffer_get_size(buffers[i]);
+    }
+
+    return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+    GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+    ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+    for (size_t i = 0; i < ctx->n_buffers; i++) {
+        ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+    }
+}
+

 // scheduler

@@ -1191,6 +1303,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                ggml_tallocr_t src_allocr = node_allocr(src);
                GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                if (src_allocr != node_allocr) {
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+
+                        int n_inputs = sched->splits[cur_split].n_inputs++;
+                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                        sched->splits[cur_split].inputs[n_inputs] = src;
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                    // check if the input is already in the split
                    bool found = false;
                    for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1206,19 +1336,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                        sched->splits[cur_split].inputs[n_inputs] = src;
                    }
-
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
                }
            }
        }
@@ -1333,7 +1451,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
        uint64_t compute_start_us = ggml_time_us();
        if (!sched->callback_eval) {
            ggml_backend_graph_compute(split_backend, &split->graph);
-          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -20,6 +20,7 @@ extern "C" {
    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
@@ -36,6 +37,7 @@ extern "C" {
    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
@@ -54,6 +56,7 @@ extern "C" {
    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);

    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -12,9 +12,10 @@
 #include <vector>
 #include <map>
 #include <array>
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
+
+// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
+#define STRINGIZE_IMPL(...) #__VA_ARGS__
+#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -118,6 +119,11 @@

 #endif // defined(GGML_USE_HIPBLAS)

+// ggml-cuda need half type so keep ggml headers include at last
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #define CUDART_HMAX     11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)

 #define CC_PASCAL     600
@@ -185,6 +191,10 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
 #endif // __has_builtin(__builtin_elementwise_sub_sat)
 }

+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
+    return __vsubss4(a, b);
+}
+
 static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
@@ -499,6 +509,14 @@ typedef struct {
 } block_iq2_xs;
 static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

+#define QR3_XXS 8
+#define QI3_XXS (QK_K / (4*QR3_XXS))
+typedef struct {
+    half d;
+    uint8_t qs[3*(QK_K/8)];
+} block_iq3_xxs;
+static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
+
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

@@ -582,13 +600,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

 [[noreturn]]
-static __device__ void bad_arch() {
-    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    (void) arch_list;
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

-    (void) bad_arch; // suppress unused function warning
+    (void) no_device_code; // suppress unused function warning
 }

+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
@@ -615,7 +648,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
    return a;
 #else
    (void) a;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -636,7 +669,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
    return x;
 #else
    (void) x;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

@@ -1592,6 +1625,41 @@ static const __device__ uint64_t iq2xs_grid[512] = {
    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
 };

+static const __device__ uint32_t iq3xxs_grid[256] = {
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+};
+
 static const __device__ uint8_t ksigns_iq2xs[128] = {
      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@@ -1603,6 +1671,43 @@ static const __device__ uint8_t ksigns_iq2xs[128] = {
    240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
 };

+//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+static const __device__ uint64_t ksigns64[128] = {
+    0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
+    0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
+    0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
+    0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
+    0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
+    0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
+    0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
+    0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
+    0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
+    0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
+    0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
+    0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
+    0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
+    0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
+    0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
+    0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
+    0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
+    0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
+    0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
+    0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
+    0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
+    0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
+    0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
+    0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
+    0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
+    0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
+    0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
+    0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
+    0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
+    0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
+    0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
+    0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
+};
+//#endif
+
 static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};

 inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
@@ -1669,6 +1774,34 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst

 }

+template<typename dst_t>
+static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i   = blockIdx.x;
+    const block_iq3_xxs * x = (const block_iq3_xxs  *) vx;
+
+    const int tid = threadIdx.x;
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t  * q3 = x[i].qs + 8*ib;
+    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
+    const uint8_t  * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
+    const uint8_t  * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
+    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -2419,7 +2552,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
    }
 #else
    (void) vx; (void) y; (void) k;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -2450,7 +2583,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2487,7 +2620,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2522,7 +2655,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2567,7 +2700,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2588,7 +2721,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

    return d8_0*d8_1 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2618,7 +2751,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2653,7 +2786,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2690,7 +2823,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2730,7 +2863,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

    return d3 * sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2755,7 +2888,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

    return d3*d8 * sumi;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2788,7 +2921,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2821,7 +2954,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2861,7 +2994,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2894,7 +3027,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2924,7 +3057,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

    return d*sumf;
 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2955,7 +3088,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    return d6 * sumf_d;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -3821,7 +3954,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    return dall * sumf_d - dmin * sumf_m;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4004,7 +4137,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    return d * sumf_d;

 #else
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4262,7 +4395,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
        q8 += 8;
        aux32 >>= 7;
    }
-    const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * sumi;
 #else
    // iqs is 0...15
@@ -4273,7 +4406,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
@@ -4292,6 +4425,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(

 static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
 #if QK_K == 256
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

@@ -4302,28 +4436,69 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const uint8_t ls2 = bq2->scales[ib32] >>  4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-        for (int j = 0; j < 8; ++j) {
-            sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
+        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
+        sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
-        const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
-        const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
-        for (int j = 0; j < 8; ++j) {
-            sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
-        }
+        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
+        sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
-    const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
+    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
    assert(false);
    return 0.f;
 #endif
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
+static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if QK_K == 256
+    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
+
+    const int ib32 = iqs;
+    const uint8_t  * q3 = bq2->qs + 8*ib32;
+    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
+    const int8_t   * q8 = bq8_1[ib32].qs;
+    uint32_t aux32 = gas[0] | (gas[1] << 16);
+    int sumi = 0;
+    for (int l = 0; l < 4; ++l) {
+        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
+        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
+        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
+        const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
+        const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
+        sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
+        sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
+        q8 += 8;
+        aux32 >>= 7;
+    }
+    const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
+    return d * sumi;
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    assert(false);
+    return 0.f;
+#endif
 }

 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
@@ -4499,7 +4674,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4568,7 +4743,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4635,7 +4810,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4702,7 +4877,7 @@ mul_mat_q5_1(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4769,7 +4944,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4836,7 +5011,7 @@ mul_mat_q2_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4905,7 +5080,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4974,7 +5149,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5041,7 +5216,7 @@ mul_mat_q5_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5110,7 +5285,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5336,27 +5511,37 @@ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
    *dsti = *xi;
 }

+static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+    const half * xi = (const half *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

-    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
    // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = i - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = i - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;

    cpy_1(cx + x_offset, cdst + dst_offset);
 }
@@ -5450,23 +5635,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {

 template <cpy_kernel_t cpy_blck, int qk>
 static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;

-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;

    cpy_blck(cx + x_offset, cdst + dst_offset);
 }
@@ -5833,7 +6021,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
    }
 #else
    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    bad_arch();
+    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

@@ -6360,6 +6548,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
    dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

+template<typename dst_t>
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = k / QK_K;
+    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -6397,6 +6591,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cuda<float>;
        default:
@@ -6430,6 +6626,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq2_xxs_cuda;
        case GGML_TYPE_IQ2_XS:
            return dequantize_row_iq2_xs_cuda;
+        case GGML_TYPE_IQ3_XXS:
+            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
        default:
@@ -6642,6 +6840,15 @@ static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(block_num_y, 1, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
@@ -7114,69 +7321,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }

+
+static void ggml_cpy_f16_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

 static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-    const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }

+
+
 static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@@ -8192,6 +8412,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        default:
            GGML_ASSERT(false);
@@ -8214,6 +8435,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ3_XXS:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q6_K:
            return 64;
@@ -8285,6 +8507,9 @@ static void ggml_cuda_op_mul_mat_vec_q(
        case GGML_TYPE_IQ2_XS:
            mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
        default:
            GGML_ASSERT(false);
            break;
@@ -9769,8 +9994,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
    // TODO: mmq/mmv support
 #endif

-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1  =  dst->nb[1];

    const struct ggml_tensor * ids = src0;
    const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -9920,19 +10145,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg

    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
-    GGML_ASSERT(src0->ne[3] == 1);
+    const int64_t ne02 = src0->ne[2];
+
+    //GGML_ASSERT(src0->ne[3] == 1);

    const int64_t nb00 = src0->nb[0];
    const int64_t nb01 = src0->nb[1];
    const int64_t nb02 = src0->nb[2];
+    const int64_t nb03 = src0->nb[3];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
-    GGML_ASSERT(src1->ne[3] == 1);
+    const int64_t ne12 = src1->ne[2];
+
+    //GGML_ASSERT(src1->ne[3] == 1);

    const int64_t nb10 = src1->nb[0];
    const int64_t nb11 = src1->nb[1];
    const int64_t nb12 = src1->nb[2];
+    const int64_t nb13 = src1->nb[3];

    ggml_cuda_set_device(g_main_device);
    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@@ -9944,17 +10175,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];

    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -10283,15 +10516,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t

    if (ggml_is_quantized(tensor->type)) {
        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
        }
    }
 }
@@ -10394,12 +10623,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }

 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];

    if (ggml_is_quantized(tensor->type)) {
@@ -10428,6 +10652,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
    /* .is_host          = */ NULL,
@@ -10703,6 +10928,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
    /* .get_name         = */ ggml_backend_cuda_split_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
    /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
    /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
@@ -10782,6 +11008,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
            /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
@@ -10919,7 +11146,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                    return false;
                }
                ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                        return false;
                    }
@@ -10963,6 +11190,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
                    return true;
                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
                return false;
            } break;
        case GGML_OP_DUP:
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_vk_device {
+    int index;
+    int type; // same as VkPhysicalDeviceType
+    size_t heapSize;
+    const char * name;
+    const char * vendor;
+    int subgroupSize;
+    uint64_t bufferAlignment;
+    uint64_t maxAlloc;
+};
+
+struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
+bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
+bool ggml_vk_has_vulkan(void);
+bool ggml_vk_has_device(void);
+struct ggml_vk_device ggml_vk_current_device(void);
+
+//
+// backend API
+//
+
+// forward declaration
+typedef struct ggml_backend * ggml_backend_t;
+
+GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
+
+GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -57,6 +57,9 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(voi
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
 GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -24,19 +24,7 @@

 #define UNUSED(x) (void)(x)

-#define GGML_METAL_MAX_KERNELS 256
-
-struct ggml_metal_buffer {
-    const char * name;
-
-    void   * data;
-    size_t   size;
-
-    id<MTLBuffer> metal;
-};
-
 struct ggml_metal_kernel {
-    id<MTLFunction>             function;
    id<MTLComputePipelineState> pipeline;
 };

@@ -72,6 +60,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
+    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
    GGML_METAL_KERNEL_TYPE_RMS_NORM,
    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
@@ -93,6 +82,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
@@ -110,6 +100,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
@@ -124,6 +115,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
@@ -138,6 +130,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_F16,
    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
@@ -168,17 +161,15 @@ struct ggml_metal_context {

    id<MTLDevice>       device;
    id<MTLCommandQueue> queue;
-    id<MTLLibrary>      library;

    dispatch_queue_t d_queue;

-    int n_buffers;
-    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
-
-    struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
+    struct ggml_metal_kernel kernels[GGML_METAL_KERNEL_TYPE_COUNT];

    bool support_simdgroup_reduction;
    bool support_simdgroup_mm;
+
+    bool should_capture_next_compute;
 };

 // MSL code
@@ -242,26 +233,24 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    // Show all the Metal device instances in the system
    NSArray * devices = MTLCopyAllDevices();
    for (id<MTLDevice> device in devices) {
-        NSString * s = [device name];
-        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
+        GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
    }
    [devices release]; // since it was created by a *Copy* C method
 #endif

    // Pick and show default Metal device
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    NSString * s = [device name];
-    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
+    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

    // Configure context
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
    ctx->device = device;
    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
    ctx->queue  = [ctx->device newCommandQueue];
-    ctx->n_buffers = 0;
-
    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

+    id<MTLLibrary> metal_library;
+
    // load library
    {
        NSBundle * bundle = nil;
@@ -276,7 +265,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
            // pre-compiled library found
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
-            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+            metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
+            if (error) {
+                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
        } else {
            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

@@ -314,14 +307,13 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {

                //[options setFastMathEnabled:false];

-                ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+                metal_library = [ctx->device newLibraryWithSource:src options:options error:&error];
+                if (error) {
+                    GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    return NULL;
+                }
            }
        }
-
-        if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-            return NULL;
-        }
    }

    // print MTL GPU family:
@@ -364,6 +356,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");

+    ctx->should_capture_next_compute = false;
+
 #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
    if (@available(macOS 10.12, iOS 16.0, *)) {
        GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
@@ -380,8 +374,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    {
        NSError * error = nil;

-        for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-            ctx->kernels[i].function = nil;
+        for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
            ctx->kernels[i].pipeline = nil;
        }

@@ -393,10 +386,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(e, name, supported) \
        if (supported) { \
            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
-            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
+            id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
+            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
+            [metal_function release]; \
            if (error) { \
                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+                [metal_library release]; \
                return NULL; \
            } \
        } else { \
@@ -436,6 +431,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,             get_rows_q6_K,          true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,       true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,          get_rows_iq3_xxs,       true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,           true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,               ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,             ctx->support_simdgroup_reduction);
@@ -457,6 +453,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,           mul_mv_q6_K_f32,        ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,     ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,        mul_mv_iq3_xxs_f32,     ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,      ctx->support_simdgroup_reduction);
      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,      ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,      ctx->support_simdgroup_reduction);
@@ -474,6 +471,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,        mul_mv_id_q6_K_f32,     ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,  ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,   ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,     mul_mv_id_iq3_xxs_f32,  ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,         ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,         ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,        ctx->support_simdgroup_mm);
@@ -488,6 +486,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,           mul_mm_q6_K_f32,        ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,     ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,        mul_mm_iq3_xxs_f32,     ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,      ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,      ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,     ctx->support_simdgroup_mm);
@@ -502,6 +501,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,        mul_mm_id_q6_K_f32,     ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,  ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,   ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,     mul_mm_id_iq3_xxs_f32,  ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,               true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,               true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,              true);
@@ -525,27 +525,17 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                  sum_rows,               true);
    }

+    [metal_library release];
    return ctx;
 }

 static void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        [ctx->buffers[i].metal release];
+    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
+        [ctx->kernels[i].pipeline release];
    }

-    for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
-        if (ctx->kernels[i].pipeline) {
-            [ctx->kernels[i].pipeline release];
-        }
-
-        if (ctx->kernels[i].function) {
-            [ctx->kernels[i].function release];
-        }
-    }
-
-    [ctx->library release];
    [ctx->queue release];
    [ctx->device release];

@@ -577,51 +567,30 @@ struct ggml_backend_metal_buffer_context {
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
    //GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);

    const int64_t tsize = ggml_nbytes(t);

    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;

-    // compatibility with ggml-backend
-    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
-
-        // find the view that contains the tensor fully
-        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
-            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
-
-            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
-            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
-                *offs = (size_t) ioffs;
-
-                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
-
-                return buf_ctx->buffers[i].metal;
-            }
-        }
-
-        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
-
-        return nil;
-    }
+    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;

    // find the view that contains the tensor fully
-    for (int i = 0; i < ctx->n_buffers; ++i) {
-        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+    for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;

-        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
-        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
+        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
            *offs = (size_t) ioffs;

-            //GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+            //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);

-            return ctx->buffers[i].metal;
+            return buf_ctx->buffers[i].metal;
        }
    }

-    GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
+    GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);

    return nil;
 }
@@ -668,7 +637,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
            return true;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction;
+            return ctx->support_simdgroup_reduction &&
+                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
@@ -721,6 +691,20 @@ static bool ggml_metal_graph_compute(
    const int n_cb = ctx->n_cb;
    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;

+    const bool should_capture = ctx->should_capture_next_compute;
+    if (should_capture) {
+        ctx->should_capture_next_compute = false;
+
+        MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
+        descriptor.captureObject = ctx->queue;
+
+        NSError * error = nil;
+        if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
+            GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
+            GGML_ASSERT(!"capture failed");
+        }
+    }
+
    id<MTLCommandBuffer> command_buffer_builder[n_cb];
    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
@@ -729,6 +713,7 @@ static bool ggml_metal_graph_compute(
        // enqueue the command buffers in order to specify their execution order
        [command_buffer enqueue];
    }
+
    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;

    dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
@@ -775,9 +760,9 @@ static bool ggml_metal_graph_compute(
                GGML_ASSERT(!"unsupported op");
            }

-#ifndef GGML_METAL_NDEBUG
-            [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
-#endif
+            if (should_capture) {
+                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
+            }

            const int64_t  ne00 = src0 ? src0->ne[0] : 0;
            const int64_t  ne01 = src0 ? src0->ne[1] : 0;
@@ -813,9 +798,9 @@ static bool ggml_metal_graph_compute(
            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;

-            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
+            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;

            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
            //if (src0) {
@@ -1304,6 +1289,7 @@ static bool ggml_metal_graph_compute(
                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
+                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                            }

@@ -1432,6 +1418,12 @@ static bool ggml_metal_graph_compute(
                                        nth1 = 16;
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
                                    } break;
+                                case GGML_TYPE_IQ3_XXS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
+                                    } break;
                                default:
                                    {
                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@@ -1474,6 +1466,11 @@ static bool ggml_metal_graph_compute(
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
+                            else if (src0t == GGML_TYPE_IQ3_XXS) {
+                                const int mem_size = 256*4+128;
+                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
                            else if (src0t == GGML_TYPE_Q4_K) {
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
@@ -1568,6 +1565,7 @@ static bool ggml_metal_graph_compute(
                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
+                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
                                default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                            }

@@ -1597,7 +1595,7 @@ static bool ggml_metal_graph_compute(
                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);

                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                            }
@@ -1699,6 +1697,12 @@ static bool ggml_metal_graph_compute(
                                        nth1 = 16;
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
                                    } break;
+                                case GGML_TYPE_IQ3_XXS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
+                                    } break;
                                default:
                                    {
                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@@ -1742,7 +1746,7 @@ static bool ggml_metal_graph_compute(
                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

                                size_t offs_src_cur = 0;
-                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);

                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
                            }
@@ -1757,6 +1761,11 @@ static bool ggml_metal_graph_compute(
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
+                            else if (src2t == GGML_TYPE_IQ3_XXS) {
+                                const int mem_size = 256*4+128;
+                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
                            else if (src2t == GGML_TYPE_Q4_K) {
                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
@@ -1797,6 +1806,7 @@ static bool ggml_metal_graph_compute(
                            case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
+                            case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
                            case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
                            default: GGML_ASSERT(false && "not implemented");
                        }
@@ -2227,9 +2237,9 @@ static bool ggml_metal_graph_compute(
                    }
            }

-#ifndef GGML_METAL_NDEBUG
-            [encoder popDebugGroup];
-#endif
+            if (should_capture) {
+                [encoder popDebugGroup];
+            }
        }

        [encoder endEncoding];
@@ -2251,6 +2261,10 @@ static bool ggml_metal_graph_compute(
        }
    }

+    if (should_capture) {
+        [[MTLCaptureManager sharedCaptureManager] stopCapture];
+    }
+
    return true;
 }

@@ -2419,6 +2433,16 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backen
    UNUSED(buft);
 }

+GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+    size_t max_size = device.maxBufferLength;
+    ggml_backend_metal_free_device();
+
+    return max_size;
+
+    UNUSED(buft);
+}
+
 GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);

@@ -2437,6 +2461,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
            /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
+            /* .get_max_size     = */ ggml_backend_metal_buffer_type_get_max_size,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
@@ -2611,6 +2636,13 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }

+void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
+    GGML_ASSERT(ggml_backend_is_metal(backend));
+
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    ctx->should_capture_next_compute = true;
+}
+
 GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning

 GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -2459,6 +2459,12 @@ typedef struct {
 } block_iq2_xs;
 // 74 bytes / block for QK_K = 256, so 2.3125 bpw

+typedef struct {
+    half d;
+    uint8_t qs[3*QK_K/8];
+} block_iq3_xxs;
+// 98 bytes / block for QK_K = 256, so 3.0625 bpw
+
 //====================================== dot products =========================

 void kernel_mul_mv_q2_K_f32_impl(
@@ -3681,6 +3687,42 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
 };

+constexpr constant static uint32_t iq3xxs_grid[256] = {
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+};
+
+
 constexpr constant static uint8_t ksigns_iq2xs[128] = {
      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@@ -3970,6 +4012,143 @@ kernel void kernel_mul_mv_iq2_xs_f32(
    kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }

+void kernel_mul_mv_iq3_xxs_f32_impl(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    const int nb = ne00/QK_K;
+    const int r0 = tgpig.x;
+    const int r1 = tgpig.y;
+    const int im = tgpig.z;
+
+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
+    const int ib_row = first_row * nb;
+
+    const uint i12 = im%ne12;
+    const uint i13 = im/ne12;
+
+    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+
+    device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0;
+    device const float         * y = (device const float         *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float yl[32];
+    float sumf[N_DST]={0.f}, all_sum;
+
+    const int nb32 = nb * (QK_K / 32);
+
+    threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
+    threadgroup uint8_t  * shared_signs = (threadgroup uint8_t *)(values + 256);
+    {
+        int nval = 4;
+        int pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i];
+        nval = 2;
+        pos  = (32*sgitg + tiisg)*nval;
+        for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+#if QK_K == 256
+    const int ix = tiisg;
+
+    device const float * y4 = y + 32 * ix;
+
+    for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
+
+        for (int i = 0; i < 32; ++i) {
+            yl[i] = y4[i];
+        }
+
+        const int ibl = ib32 / (QK_K / 32);
+        const int ib  = ib32 % (QK_K / 32);
+
+        device const block_iq3_xxs * xr = x + ibl;
+        device const uint8_t  * q3 = xr->qs + 8 * ib;
+        device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib;
+        device const half * dh = &xr->d;
+
+        for (int row = 0; row < N_DST; row++) {
+
+            const float db = dh[0];
+            const uint32_t aux32 = gas[0] | (gas[1] << 16);
+            const float d = db * (0.5f + (aux32 >> 28));
+
+            float2 sum = {0};
+            for (int l = 0; l < 4; ++l) {
+                const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]);
+                const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]);
+                const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+            }
+            sumf[row] += d * (sum[0] + sum[1]);
+
+            dh  += nb*sizeof(block_iq3_xxs)/2;
+            q3  += nb*sizeof(block_iq3_xxs);
+            gas += nb*sizeof(block_iq3_xxs)/2;
+        }
+
+        y4 += 32 * 32;
+    }
+#else
+    // TODO
+#endif
+
+    for (int row = 0; row < N_DST; ++row) {
+        all_sum = simd_sum(sumf[row]);
+        if (tiisg == 0) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
+        }
+    }
+}
+
+[[host_name("kernel_mul_mv_iq3_xxs_f32")]]
+kernel void kernel_mul_mv_iq3_xxs_f32(
+        device const  void * src0,
+        device const float * src1,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   uint    & r2,
+        constant   uint    & r3,
+        threadgroup int8_t * shared_values [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
+}
+
+
 //============================= templates and their specializations =============================

 // NOTE: this is not dequantizing - we are simply fitting the template
@@ -4287,6 +4466,33 @@ void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4
    }
 }

+template <typename type4x4>
+void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
+    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
+    const float d = xb->d;
+    const int ib32 = il/2;
+    il = il%2;
+    // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
+    device const uint8_t * q3 = xb->qs + 8*ib32;
+    device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
+    const uint32_t aux32 = gas[0] | (gas[1] << 16);
+    const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
+    constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
+    constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
+    uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+    grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
+    grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
+    signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
+    for (int i = 0; i < 4; ++i) {
+        reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
+        reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
+    }
+}
+
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
 kernel void kernel_get_rows(
        device const  void * src0,
@@ -4828,6 +5034,7 @@ template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
 template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_get_rows_iq2_xs")]]  kernel get_rows_t kernel_get_rows<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;

 //
 // matrix-matrix multiplication
@@ -4866,6 +5073,7 @@ template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
 template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
 template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;

 //
 // indirect matrix-matrix multiplication
@@ -4916,6 +5124,7 @@ template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mu
 template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
 template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;

 //
 // matrix-vector multiplication
@@ -5818,3 +6027,68 @@ kernel void kernel_mul_mv_id_iq2_xs_f32(
        tiisg,
        sgitg);
 }
+
+[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]]
+kernel void kernel_mul_mv_id_iq3_xxs_f32(
+        device const    char * ids,
+        device const    char * src1,
+        device         float * dst,
+        constant    uint64_t & nbi1,
+        constant     int64_t & ne00,
+        constant     int64_t & ne01,
+        constant     int64_t & ne02,
+        constant    uint64_t & nb00,
+        constant    uint64_t & nb01,
+        constant    uint64_t & nb02,
+        constant     int64_t & ne10,
+        constant     int64_t & ne11,
+        constant     int64_t & ne12,
+        constant     int64_t & ne13,
+        constant    uint64_t & nb10,
+        constant    uint64_t & nb11,
+        constant    uint64_t & nb12,
+        constant     int64_t & ne0,
+        constant     int64_t & ne1,
+        constant    uint64_t & nb1,
+        constant        uint & r2,
+        constant        uint & r3,
+        constant         int & idx,
+        device const    char * src00,
+        device const    char * src01,
+        device const    char * src02,
+        device const    char * src03,
+        device const    char * src04,
+        device const    char * src05,
+        device const    char * src06,
+        device const    char * src07,
+        threadgroup int8_t   * shared_values [[threadgroup(0)]],
+        uint3                  tgpig[[threadgroup_position_in_grid]],
+        uint                   tiitg[[thread_index_in_threadgroup]],
+        uint                   tiisg[[thread_index_in_simdgroup]],
+        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
+    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
+
+    const int64_t bid = tgpig.z/(ne12*ne13);
+
+    tgpig.z = tgpig.z%(ne12*ne13);
+
+    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
+
+    kernel_mul_mv_iq3_xxs_f32_impl(
+        src0[id],
+        (device const float *) (src1 + bid*nb11),
+        dst + bid*ne0,
+        ne00,
+        ne01,
+        ne02,
+        ne10,
+        ne12,
+        ne0,
+        ne1,
+        r2,
+        r3,
+        shared_values,
+        tgpig,
+        tiisg,
+        sgitg);
+}
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
        dst[row] = tmp[0];
    }
 }
-
 );


@@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
        dst[row] = tmp[0];
    }
 }
+
 );


@@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
 }
 );

+std::string add_template = MULTILINE_QUOTE(
+__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
@@ -878,6 +890,7 @@ static std::string generate_kernels() {
        }
        src << mul_kernel << '\n';
    }
+    src << add_template << '\n';

    return src.str();
 }
@@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
 static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
 static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
 static cl_kernel mul_f32_cl;
+static cl_kernel add_f32_cl;
 static bool fp16_support;

 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
    char *ext_buffer = (char *)alloca(ext_str_size + 1);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Disabled due to faulty outputs
    // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+    fp16_support = false;  // strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");

    cl_context_properties properties[] = {
        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {

    // mul kernel
    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
+
+    CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
 }

 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
    ggml_cl_mul_f32(src0, src1, dst);
 }

+static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_add_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
@@ -2044,6 +2125,15 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
    GGML_UNUSED(buffer_type);
 }

+static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+    static size_t max_size = -1;
+    if (max_size == (size_t)-1) {
+        ggml_cl_init();
+        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
+    }
+    return max_size;
+}
+
 static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
    //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
    return ggml_backend_is_cpu(backend);
@@ -2055,6 +2145,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
    /* .get_alloc_size   = */ NULL,
    /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
    /* .is_host          = */ NULL,
@@ -2111,6 +2202,7 @@ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
            /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
            /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@@ -10,6 +10,7 @@ extern "C" {
 GGML_API void ggml_cl_init(void);

 GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
 GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3441,6 +3441,41 @@ static const uint64_t iq2xs_grid[512] = {
    0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
 };

+static const uint32_t iq3xxs_grid[256] = {
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
+};
+
 static const uint8_t ksigns_iq2xs[128] = {
      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@@ -3507,6 +3542,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
    }
 }

+// ====================== 3.0625 bpw (de)-quantization
+
+void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
+    assert(k % QK_K == 0);
+    const int nb = k / QK_K;
+
+    uint32_t aux32;
+
+    for (int i = 0; i < nb; i++) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * scales_and_signs = qs + QK_K/4;
+
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t));
+            const float db = d * (0.5f + (aux32 >> 28)) * 0.5f;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
+                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]);
+                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]);
+                for (int j = 0; j < 4; ++j) {
+                    y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+                    y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+                }
+                y += 8;
+            }
+            qs += 8;
+        }
+    }
+}
+
 //===================================== Q8_K ==============================================

 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -8458,17 +8525,36 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest

    const __m128i m4 = _mm_set1_epi8(0xf);
    const __m128i m1 = _mm_set1_epi8(1);
-    const __m128i m511 = _mm_set1_epi16(511);
-    const __m128i m127 = _mm_set1_epi16(127);
+    const __m256i m511 = _mm256_set1_epi16(511);
+    const __m256i mone = _mm256_set1_epi8(1);

-    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+    static const uint8_t k_bit_helper[32] = {
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+        0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+    };
+    static const char block_sign_shuffle_mask_1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+    };
+    static const char block_sign_shuffle_mask_2[32] = {
+        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+        0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+    };
+    static const uint8_t bit_selector_mask_bytes[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
+    const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
+    const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
+    const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);

    uint64_t aux64;

    // somewhat hacky, but gives a significant boost in performance
-    __m128i aux_gindex, aux_sindex;
+    __m256i aux_gindex;
    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
-    const uint16_t * sindex = (const uint16_t *)&aux_sindex;

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
@@ -8483,26 +8569,68 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest

        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
-        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+            const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2);  q2 += 16;
+            aux_gindex = _mm256_and_si256(q2_data, m511);
+
+            const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
+            const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
+            const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
+
+            const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
+            const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
+
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2);  q2 +=  8;
-            aux_gindex = _mm_and_si128(q2_data, m511);
-            aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
-            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
-            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
-            const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
-            const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
-            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
-            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+
+            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+                                                   iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+                                                   iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+            const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+                                                   iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+            const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+                                                   iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+
+            const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
+            const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
+            const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
+            const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
+
+            __m256i signs;
+            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
+
+            signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
+            signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+            const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
+
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const __m256i dot3  = _mm256_maddubs_epi16(q2_3, q8s_3);
+            const __m256i dot4  = _mm256_maddubs_epi16(q2_4, q8s_4);

            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
+            const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
+            const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));

            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
+            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
+            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
        }

        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
@@ -8551,6 +8679,136 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
 #endif
 }

+// TODO
+void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    assert(n % QK_K == 0);
+
+    const block_iq3_xxs * restrict x = vx;
+    const block_q8_K    * restrict y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__ARM_NEON)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    ggml_int8x16x4_t q3s;
+    ggml_int8x16x4_t q8b;
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t   * restrict q8 = y[i].qs;
+        float sumf1 = 0, sumf2 = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
+            memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
+            const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
+            const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
+            const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
+            const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            q3 += 16;
+            q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >>  7) & 127))));
+            q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
+            q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >>  7) & 127))));
+            q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
+            q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
+            q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
+            q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
+            q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
+            sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
+            sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
+        }
+        sumf += d*(sumf1 + sumf2);
+    }
+    *s = 0.5f * sumf;
+
+#elif defined(__AVX2__)
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+                                                  iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+            const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+                                                   signs64[(aux32[0] >>  7) & 127], signs64[(aux32[0] >>  0) & 127]);
+            const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+                                                   signs64[(aux32[1] >>  7) & 127], signs64[(aux32[1] >>  0) & 127]);
+            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
+            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
+#else
+
+    uint32_t aux32;
+
+    float sumf = 0.f;
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t  * restrict q8 = y[i].qs;
+        int32_t bsum = 0;
+        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
+            memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
+            const uint32_t ls = 2*(aux32 >> 28) + 1;
+            int32_t sumi = 0;
+            for (int l = 0; l < 4; ++l) {
+                const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
+                const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
+                const uint8_t  signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
+                for (int j = 0; j < 4; ++j) {
+                    sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
+                    sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
+                }
+                q8 += 8;
+            }
+            q3 += 8;
+            bsum += sumi * ls;
+        }
+        sumf += d * bsum;
+    }
+    *s = 0.25f * sumf;
+#endif
+}
+
 // ================================ IQ2 quantization =============================================

 typedef struct {
@@ -9189,3 +9447,436 @@ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, i
    return nrow * nblock * sizeof(block_iq2_xs);
 }

+//
+// ============================================= 3-bit using D4 lattice
+//
+
+typedef struct {
+    uint32_t * grid;
+    int      * map;
+    uint16_t * neighbours;
+} iq3_entry_t;
+
+static iq3_entry_t iq3_data[1] = {
+    {NULL, NULL, NULL},
+};
+
+static inline int iq3_data_index(int grid_size) {
+    (void)grid_size;
+    GGML_ASSERT(grid_size == 256);
+    return 0;
+}
+
+static int iq3_compare_func(const void * left, const void * right) {
+    const int * l = (const int *)left;
+    const int * r = (const int *)right;
+    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
+}
+
+void iq3xs_init_impl(int grid_size) {
+    const int gindex = iq3_data_index(grid_size);
+    if (iq3_data[gindex].grid) {
+        return;
+    }
+    static const uint16_t kgrid_256[256] = {
+            0,     2,     4,     9,    11,    15,    16,    18,    25,    34,    59,    61,    65,    67,    72,    74,
+           81,    85,    88,    90,    97,   108,   120,   128,   130,   132,   137,   144,   146,   153,   155,   159,
+          169,   175,   189,   193,   199,   200,   202,   213,   248,   267,   287,   292,   303,   315,   317,   321,
+          327,   346,   362,   413,   436,   456,   460,   462,   483,   497,   513,   515,   520,   522,   529,   531,
+          536,   538,   540,   551,   552,   576,   578,   585,   592,   594,   641,   643,   648,   650,   657,   664,
+          698,   704,   706,   720,   729,   742,   758,   769,   773,   808,   848,   852,   870,   889,   901,   978,
+          992,  1024,  1026,  1033,  1035,  1040,  1042,  1046,  1049,  1058,  1089,  1091,  1093,  1096,  1098,  1105,
+         1112,  1139,  1143,  1144,  1152,  1154,  1161,  1167,  1168,  1170,  1183,  1184,  1197,  1217,  1224,  1228,
+         1272,  1276,  1309,  1323,  1347,  1367,  1377,  1404,  1473,  1475,  1486,  1509,  1537,  1544,  1546,  1553,
+         1555,  1576,  1589,  1594,  1600,  1602,  1616,  1625,  1636,  1638,  1665,  1667,  1672,  1685,  1706,  1722,
+         1737,  1755,  1816,  1831,  1850,  1856,  1862,  1874,  1901,  1932,  1950,  1971,  2011,  2032,  2052,  2063,
+         2077,  2079,  2091,  2095,  2172,  2192,  2207,  2208,  2224,  2230,  2247,  2277,  2308,  2345,  2356,  2389,
+         2403,  2424,  2501,  2504,  2506,  2520,  2570,  2593,  2616,  2624,  2630,  2646,  2669,  2700,  2714,  2746,
+         2754,  2795,  2824,  2835,  2839,  2874,  2882,  2905,  2984,  3028,  3042,  3092,  3108,  3110,  3124,  3153,
+         3185,  3215,  3252,  3288,  3294,  3364,  3397,  3434,  3483,  3523,  3537,  3587,  3589,  3591,  3592,  3610,
+         3626,  3670,  3680,  3722,  3749,  3754,  3776,  3789,  3803,  3824,  3857,  3873,  3904,  3906,  3924,  3992,
+    };
+    const int kmap_size = 4096;
+    const int nwant = 2;
+    const uint16_t * kgrid = kgrid_256;
+    uint32_t * kgrid_q3xs;
+    int      * kmap_q3xs;
+    uint16_t * kneighbors_q3xs;
+
+    printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
+    uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
+    for (int k = 0; k < grid_size; ++k) {
+        int8_t * pos = (int8_t *)(the_grid + k);
+        for (int i = 0; i < 4; ++i) {
+            int l = (kgrid[k] >> 3*i) & 0x7;
+            pos[i] = 2*l + 1;
+        }
+    }
+    kgrid_q3xs = the_grid;
+    iq3_data[gindex].grid = the_grid;
+    kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
+    iq3_data[gindex].map = kmap_q3xs;
+    for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
+    uint32_t aux32;
+    uint8_t * aux8 = (uint8_t *)&aux32;
+    for (int i = 0; i < grid_size; ++i) {
+        aux32 = kgrid_q3xs[i];
+        uint16_t index = 0;
+        for (int k=0; k<4; ++k) {
+            uint16_t q = (aux8[k] - 1)/2;
+            index |= (q << 3*k);
+        }
+        kmap_q3xs[index] = i;
+    }
+    int8_t pos[4];
+    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    int num_neighbors = 0, num_not_in_map = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q3xs[i] >= 0) continue;
+        ++num_not_in_map;
+        for (int k = 0; k < 4; ++k) {
+            int l = (i >> 3*k) & 0x7;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+        int n = 0; int d2 = dist2[0];
+        int nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            ++n;
+        }
+        num_neighbors += n;
+    }
+    printf("%s: %d neighbours in total\n", __func__, num_neighbors);
+    kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
+    iq3_data[gindex].neighbours = kneighbors_q3xs;
+    int counter = 0;
+    for (int i = 0; i < kmap_size; ++i) {
+        if (kmap_q3xs[i] >= 0) continue;
+        for (int k = 0; k < 4; ++k) {
+            int l = (i >> 3*k) & 0x7;
+            pos[k] = 2*l + 1;
+        }
+        for (int j = 0; j < grid_size; ++j) {
+            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+            int d2 = 0;
+            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+            dist2[2*j+0] = d2;
+            dist2[2*j+1] = j;
+        }
+        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+        kmap_q3xs[i] = -(counter + 1);
+        int d2 = dist2[0];
+        uint16_t * start = &kneighbors_q3xs[counter++];
+        int n = 0, nhave = 1;
+        for (int j = 0; j < grid_size; ++j) {
+            if (dist2[2*j] > d2) {
+                if (nhave == nwant) break;
+                d2 = dist2[2*j];
+                ++nhave;
+            }
+            kneighbors_q3xs[counter++] = dist2[2*j+1];
+            ++n;
+        }
+        *start = n;
+    }
+    free(dist2);
+}
+
+void iq3xs_free_impl(int grid_size) {
+    GGML_ASSERT(grid_size == 256);
+    const int gindex = iq3_data_index(grid_size);
+    if (iq3_data[gindex].grid) {
+        free(iq3_data[gindex].grid);       iq3_data[gindex].grid = NULL;
+        free(iq3_data[gindex].map);        iq3_data[gindex].map  = NULL;
+        free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
+    }
+}
+
+static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
+        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
+    int num_neighbors = neighbours[0];
+    GGML_ASSERT(num_neighbors > 0);
+    float best_d2 = FLT_MAX;
+    int grid_index = -1;
+    for (int j = 1; j <= num_neighbors; ++j) {
+        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
+        float d2 = 0;
+        for (int i = 0; i < 4; ++i) {
+            float q = pg[i];
+            float diff = scale*q - xval[i];
+            d2 += weight[i]*diff*diff;
+        }
+        if (d2 < best_d2) {
+            best_d2 = d2; grid_index = neighbours[j];
+        }
+    }
+    GGML_ASSERT(grid_index >= 0);
+    const int8_t * pg = (const int8_t *)(grid + grid_index);
+    for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
+    return grid_index;
+}
+
+static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
+
+    const int gindex = iq3_data_index(256);
+
+    const uint32_t * kgrid_q3xs      = iq3_data[gindex].grid;
+    const int      * kmap_q3xs       = iq3_data[gindex].map;
+    const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
+
+    //GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q3xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q3xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(n%QK_K == 0);
+
+    const int kMaxQ = 8;
+
+    const int nbl = n/256;
+
+    block_iq3_xxs * y = vy;
+
+    float scales[QK_K/32];
+    float weight[32];
+    float xval[32];
+    int8_t L[32];
+    int8_t Laux[32];
+    float  waux[32];
+    bool   is_on_grid[8];
+    bool   is_on_grid_aux[8];
+    uint8_t block_signs[8];
+    uint8_t q3[3*(QK_K/8)];
+    uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
+
+    for (int ibl = 0; ibl < nbl; ++ibl) {
+
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(q3, 0, 3*QK_K/8);
+
+        float max_scale = 0;
+
+        const float * xbl = x + QK_K*ibl;
+        float sumx2 = 0;
+        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
+        float sigma2 = sumx2/QK_K;
+
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            const float * xb = xbl + 32*ib;
+            if (quant_weights) {
+                const float * qw = quant_weights + QK_K*ibl + 32*ib;
+                for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+            } else {
+                for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
+            }
+            for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
+            for (int k = 0; k < 4; ++k) {
+                int nflip = 0;
+                uint8_t s = 0;
+                for (int i = 0; i < 8; ++i) {
+                    if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
+                    else {
+                        xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
+                    }
+                }
+                if (nflip%2) {
+                    int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
+                    for (int i = 1; i < 8; ++i) {
+                        float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
+                        if (ax < min) {
+                            min = ax; imin = i;
+                        }
+                    }
+                    xval[8*k+imin] = -xval[8*k+imin];
+                    s ^= (1 << imin);
+                }
+                block_signs[k] = s & 127;
+            }
+            float max = xval[0];
+            for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
+            if (!max) {
+                scales[ib] = 0;
+                memset(L, 0, 32);
+                continue;
+            }
+            float best = 0;
+            float scale = max/(2*kMaxQ-1);
+            for (int is = -15; is <= 15; ++is) {
+                float id = (2*kMaxQ-1+is*0.2f)/max;
+                float this_scale = 1/id;
+                for (int k = 0; k < 8; ++k) {
+                    for (int i = 0; i < 4; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
+                        Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
+                    }
+                    uint16_t u = 0;
+                    for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
+                    int grid_index = kmap_q3xs[u];
+                    is_on_grid_aux[k] = true;
+                    if (grid_index < 0) {
+                        is_on_grid_aux[k] = false;
+                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
+                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
+                    }
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*Laux[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
+                    scale = sumqx/sumq2; best = scale*sumqx;
+                    for (int i = 0; i < 32; ++i) L[i] = Laux[i];
+                    for (int k = 0; k <  8; ++k) is_on_grid[k] = is_on_grid_aux[k];
+                }
+            }
+            int n_not_ongrid = 0;
+            for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
+            if (n_not_ongrid > 0 && scale > 0) {
+                float id = 1/scale;
+                for (int k = 0; k < 8; ++k) {
+                    if (is_on_grid[k]) continue;
+                    uint16_t u = 0;
+                    for (int i = 0; i < 4; ++i) {
+                        int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
+                        l = MAX(0, MIN(kMaxQ-1, l));
+                        u |= (l << 3*i);
+                    }
+                    int grid_index = kmap_q3xs[u];
+                    if (grid_index < 0) {
+                        const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
+                        grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
+                    }
+                    const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
+                    for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
+                }
+                float sumqx = 0, sumq2 = 0;
+                for (int i = 0; i < 32; ++i) {
+                    float w = weight[i];
+                    float q = 2*L[i] + 1;
+                    sumqx += w*xval[i]*q;
+                    sumq2 += w*q*q;
+                }
+                if (sumq2 > 0) scale = sumqx/sumq2;
+            }
+            if (scale < 0) {
+                // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
+                // and correspondingly flip quant signs.
+                scale = -scale;
+                for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
+            }
+            for (int k = 0; k < 8; ++k) {
+                uint16_t u = 0;
+                for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
+                int grid_index = kmap_q3xs[u];
+                if (grid_index < 0) {
+                    printf("Oops: found point %u not on grid:", u);
+                    for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
+                    printf("\n");
+                    GGML_ASSERT(false);
+                }
+                q3[8*ib+k] = grid_index;
+            }
+            scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
+            GGML_ASSERT(scale >= 0);
+            scales[ib] = scale;
+            max_scale = MAX(max_scale, scale);
+        }
+
+        if (!max_scale) {
+            memset(y[ibl].qs, 0, 3*QK_K/8);
+            continue;
+        }
+
+        float d = max_scale/31;
+        y[ibl].d = GGML_FP32_TO_FP16(d);
+        float id = 1/d;
+        float sumqx = 0, sumq2 = 0;
+        for (int ib = 0; ib < QK_K/32; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(15, l));
+            scales_and_signs[ib] |= ((uint32_t)l << 28);
+            if (false) {
+                const float * xb = xbl + 32*ib;
+                if (quant_weights) {
+                    const float * qw = quant_weights + QK_K*ibl + 32*ib;
+                    for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
+                } else {
+                    for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
+                }
+                const float db = 0.25f * d * (1 + 2*l);
+                for (int k = 0; k < 8; ++k) {
+                    const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
+                    const float * xk = xb + 4*k;
+                    const float * wk = weight + 4*k;
+                    //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
+                    const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
+                    float best_mse = 0; int best_index = q3[8*ib+k];
+                    for (int j = 0; j < 4; ++j) {
+                        float diff = db * grid[j] * signs[j] - xk[j];
+                        best_mse += wk[j] * diff * diff;
+                    }
+                    for (int idx = 0; idx < 256; ++idx) {
+                        //grid = (const uint8_t *)(kgrid_q3xs + idx);
+                        grid = (const uint8_t *)(iq3xxs_grid + idx);
+                        float mse = 0;
+                        for (int j = 0; j < 4; ++j) {
+                            float diff = db * grid[j] * signs[j] - xk[j];
+                            mse += wk[j] * diff * diff;
+                        }
+                        if (mse < best_mse) {
+                            best_mse = mse; best_index = idx;
+                        }
+                    }
+                    q3[8*ib+k] = best_index;
+                    //grid = (const uint8_t *)(kgrid_q3xs + best_index);
+                    grid = (const uint8_t *)(iq3xxs_grid + best_index);
+                    for (int j = 0; j < 4; ++j) {
+                        float q = db * grid[j] * signs[j];
+                        sumqx += wk[j] * q * xk[j];
+                        sumq2 += wk[j] * q * q;
+                    }
+                }
+                if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
+            }
+        }
+        memcpy(y[ibl].qs, q3, 3*QK_K/8);
+    }
+}
+
+size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
+    (void)hist;
+    GGML_ASSERT(n_per_row%QK_K == 0);
+    int nblock = n_per_row/QK_K;
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
+        src += n_per_row;
+        qrow += nblock*sizeof(block_iq3_xxs);
+    }
+    return nrow * nblock * sizeof(block_iq3_xxs);
+}
+
+void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
+    assert(k % QK_K == 0);
+    block_iq3_xxs * restrict y = vy;
+    quantize_row_iq3_xxs_reference(x, y, k);
+}
+
+void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
+    assert(k % QK_K == 0);
+    quantize_row_iq3_xxs_impl(x, y, k, NULL);
+}
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -166,7 +166,7 @@ typedef struct {
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

 // (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml dsign, it ends up using
+// Due to the need to use blocks as per ggml design, it ends up using
 // 2.0625 bpw because of the 16-bit scale for each block of 256.
 typedef struct {
    ggml_fp16_t d;
@@ -182,6 +182,15 @@ typedef struct {
 } block_iq2_xs;
 static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

+// (Almost) "true" 3-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 3.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[3*QK_K/8];
+} block_iq3_xxs;
+static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
+
 // Quantization
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
@@ -196,6 +205,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
 void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);

 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -210,6 +220,7 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);

 // Dequantization
 void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -227,6 +238,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
 void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
 void dequantize_row_iq2_xs (const block_iq2_xs  * restrict x, float * restrict y, int k);
+void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);

 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -242,12 +254,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 //
 size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@@ -260,3 +274,5 @@ size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row,

 void iq2xs_init_impl(int grid_size);
 void iq2xs_free_impl(int grid_size);
+void iq3xs_init_impl(int grid_size);
+void iq3xs_free_impl(int grid_size);
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -0,0 +1,27 @@
+/*MIT license
+  Copyright (C) 2024 Intel Corporation
+  SPDX-License-Identifier: MIT
+*/
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_SYCL_MAX_DEVICES       16
+#define GGML_SYCL_NAME "SYCL"
+
+GGML_API void   ggml_init_sycl(void);
+GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml-vulkan-shaders.hpp
+++ b/ggml-vulkan-shaders.hpp
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_VK_NAME "Vulkan"
+
+GGML_API void ggml_vk_init(void);
+
+GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node);
+GGML_API void ggml_vk_preallocate_buffers(void);
+GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node);
+GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#endif
+GGML_API void ggml_vk_graph_cleanup(void);
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void);
+
+GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -353,6 +353,7 @@ extern "C" {
        GGML_TYPE_Q8_K = 15,
        GGML_TYPE_IQ2_XXS = 16,
        GGML_TYPE_IQ2_XS  = 17,
+        GGML_TYPE_IQ3_XXS = 18,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
@@ -389,6 +390,7 @@ extern "C" {
        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
    };

    // available tensor operations:
@@ -489,6 +491,8 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_HARDSWISH,
+        GGML_UNARY_OP_HARDSIGMOID,

        GGML_UNARY_OP_COUNT,
    };
@@ -1032,6 +1036,16 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // hardswish(x) = x * relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardswish(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // hardsigmoid(x) = relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardsigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@@ -1483,6 +1497,17 @@ extern "C" {
            int                  d1,
            bool                 is_2D);

+    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                  s0,
+            int                  s1,
+            int                  p0,
+            int                  p1,
+            int                  d0,
+            int                  d1);
+
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -2240,9 +2265,12 @@ extern "C" {
    GGML_API int ggml_cpu_has_blas       (void);
    GGML_API int ggml_cpu_has_cublas     (void);
    GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_vulkan     (void);
+    GGML_API int ggml_cpu_has_kompute    (void);
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_vsx        (void);

    //
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -97,8 +97,11 @@ class MODEL_ARCH(IntEnum):
    BLOOM     = auto()
    STABLELM  = auto()
    QWEN      = auto()
+    QWEN2     = auto()
    PHI2      = auto()
    PLAMO     = auto()
+    CODESHELL = auto()
+    ORION     = auto()


 class MODEL_TENSOR(IntEnum):
@@ -145,8 +148,11 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.BLOOM:          "bloom",
    MODEL_ARCH.STABLELM:       "stablelm",
    MODEL_ARCH.QWEN:           "qwen",
+    MODEL_ARCH.QWEN2:          "qwen2",
    MODEL_ARCH.PHI2:           "phi2",
    MODEL_ARCH.PLAMO:          "plamo",
+    MODEL_ARCH.CODESHELL:      "codeshell",
+    MODEL_ARCH.ORION:          "orion",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -356,6 +362,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.QWEN2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.PLAMO: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -396,7 +416,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
-    ]
+    ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.ORION: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    # TODO
 }

@@ -417,6 +466,14 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
+    MODEL_ARCH.CODESHELL: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+    MODEL_ARCH.ORION: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
 }

 #
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -107,7 +107,7 @@ class GGUFReader:
        offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
        new_align = self.fields.get('general.alignment')
        if new_align is not None:
-            if new_align.types != [GGUFValueType.UINT64]:
+            if new_align.types != [GGUFValueType.UINT32]:
                raise ValueError('Bad type for general.alignment field')
            self.alignment = new_align.parts[-1][0]
        padding = offs % self.alignment
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -154,6 +154,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",        # llama-hf
            "layers.{bid}.attention.inner_attention.rope.freqs",       # llama-pth
            "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
+            "transformer.h.{bid}.attn.rotary_emb.inv_freq",            # codeshell
        ),

        # Feed-forward norm
--- a/1
+++ b/1
--- a/kompute-shaders/common.comp
+++ b/kompute-shaders/common.comp
@@ -0,0 +1,102 @@
+#extension GL_EXT_shader_16bit_storage: require
+#extension GL_EXT_shader_8bit_storage: require
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_control_flow_attributes: enable
+#extension GL_KHR_shader_subgroup_arithmetic : require
+#extension GL_EXT_debug_printf : enable
+
+#define QK4_0 32
+#define QK4_1 32
+
+#define GELU_COEF_A 0.044715
+#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
+#define TWOPI_F 6.283185307179586f
+
+#define QK_K 256
+
+#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
+#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
+#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
+#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
+
+#define sizeof_block_q4_0 0x12
+struct block_q4_0 {
+    float16_t d;
+    uint8_t qs[QK4_0 / 2];
+};
+mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
+    const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
+    const float d2 = d1 / 256.f;
+    const float md = -8.f * xb.d;
+    const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
+    const uint16_t mask1 = mask0 << 8;
+
+    mat4 reg;
+    for (int i=0;i<8;i++) {
+        uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
+        reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
+        reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
+    }
+    return reg;
+}
+
+#define sizeof_block_q4_1 0x14
+struct block_q4_1 {
+    float16_t d;
+    float16_t m;
+    uint8_t qs[QK4_1 / 2];
+};
+mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
+    const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
+    const float d2 = d1 / 256.f;
+    const float  m = xb.m;
+    const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
+    const uint16_t mask1 = mask0 << 8;
+
+    mat4 reg;
+    for (int i=0;i<8;i++) {
+        uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
+        reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
+        reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
+    }
+    return reg;
+}
+
+#define sizeof_block_q6_k 210
+struct block_q6_k {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    float16_t d;             // super-block scale
+};
+mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
+    const float16_t d_all = xb.d;
+
+    const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
+    const uint qhIndex = 32*(il/8) + 16*(il&1);
+    float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
+    il = (il/2) & 3;
+
+    const uint16_t  kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
+    const uint16_t  kmask2 = il>1 ? uint8_t(0xF0)             : uint8_t(0x0F);
+    const float16_t coef   = il>1 ? float16_t(1.f/16.f)       : float16_t(1.f);
+    const float16_t ml = float16_t(d_all * sc * 32.f);
+    const float16_t dl = float16_t(d_all * sc * coef);
+    mat4 reg;
+    for (int i = 0; i < 16; ++i) {
+        const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
+                                        : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
+        reg[i/4][i%4] = dl * q - ml;
+    }
+    return reg;
+}
+
+
+#define QK8_0 32
+// struct block_q8_0 {
+//     float16_t d;         // delta
+//     int8_t    qs[QK8_0]; // quants
+// };
+#define sizeof_block_q8_0 34
--- a/kompute-shaders/op_add.comp
+++ b/kompute-shaders/op_add.comp
@@ -0,0 +1,58 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1024) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb00;
+    int nb01;
+    int nb02;
+    int nb03;
+    int ne10;
+    int ne11;
+    int ne12;
+    int ne13;
+    int nb10;
+    int nb11;
+    int nb12;
+    int nb13;
+    int ne0;
+    int nb0;
+    int nb1;
+    int nb2;
+    int nb3;
+  //int offs; // TODO: needed for GGML_OP_ACC, see metal code
+} pcs;
+
+// general-purpose kernel for addition of two tensors
+// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
+// cons: not very efficient
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const uint i13 = i03 % pcs.ne13;
+    const uint i12 = i02 % pcs.ne12;
+    const uint i11 = i01 % pcs.ne11;
+
+    int offs = 0; // TMP (see above)
+
+    uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
+    uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11       ) / 4);
+    uint dst_off  = uint((i03*pcs.nb3  + i02*pcs.nb2  + i01*pcs.nb1  + offs) / 4);
+
+    for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
+        const uint i10 = i0 % pcs.ne10;
+        out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
+    }
+}
--- a/kompute-shaders/op_addrow.comp
+++ b/kompute-shaders/op_addrow.comp
@@ -0,0 +1,25 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
+layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    uint row;
+} pcs;
+
+void main() {
+    const uint baseIndex = gl_WorkGroupID.x * 4;
+
+    for (uint x = 0; x < 4; x++) {
+        const uint i = baseIndex + x;
+        out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
+    }
+}
--- a/kompute-shaders/op_cpy_f16_f16.comp
+++ b/kompute-shaders/op_cpy_f16_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "common.comp"
+
+#define IN_TYPE float16_t
+#define IN_TYPE_SIZE 2
+#define OUT_TYPE float16_t
+#define OUT_TYPE_SIZE 2
+
+layout(local_size_x = 1024) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
--- a/kompute-shaders/op_cpy_f16_f32.comp
+++ b/kompute-shaders/op_cpy_f16_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "common.comp"
+
+#define IN_TYPE float16_t
+#define IN_TYPE_SIZE 2
+#define OUT_TYPE float
+#define OUT_TYPE_SIZE 4
+
+layout(local_size_x = 1024) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
--- a/kompute-shaders/op_cpy_f32_f16.comp
+++ b/kompute-shaders/op_cpy_f32_f16.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "common.comp"
+
+#define IN_TYPE float
+#define IN_TYPE_SIZE 4
+#define OUT_TYPE float16_t
+#define OUT_TYPE_SIZE 2
+
+layout(local_size_x = 1024) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
--- a/kompute-shaders/op_cpy_f32_f32.comp
+++ b/kompute-shaders/op_cpy_f32_f32.comp
@@ -0,0 +1,52 @@
+#version 450
+
+#include "common.comp"
+
+#define IN_TYPE float
+#define IN_TYPE_SIZE 4
+#define OUT_TYPE float
+#define OUT_TYPE_SIZE 4
+
+layout(local_size_x = 1024) in;
+
+layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
+layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inOff;
+    uint outOff;
+    int ne00;
+    int ne01;
+    int ne02;
+    uint nb00;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    int ne0;
+    int ne1;
+    int ne2;
+    uint nb0;
+    uint nb1;
+    uint nb2;
+    uint nb3;
+} pcs;
+
+void main() {
+    const uint i03 = gl_WorkGroupID.z;
+    const uint i02 = gl_WorkGroupID.y;
+    const uint i01 = gl_WorkGroupID.x;
+
+    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
+
+    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
+    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
+    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
+    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
+
+    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
+
+    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
+        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
+        out_[dst_data+i00] = OUT_TYPE(in_[src]);
+    }
+}
--- a/kompute-shaders/op_diagmask.comp
+++ b/kompute-shaders/op_diagmask.comp
@@ -0,0 +1,30 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+    uint n_past;
+    int ne00;
+    int ne01;
+} pcs;
+
+void main() {
+    const uint i02 = gl_WorkGroupID.z;
+    const uint i01 = gl_WorkGroupID.y;
+    const uint i00 = gl_WorkGroupID.x;
+
+    const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
+
+    if (i00 > pcs.n_past + i01) {
+        out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
+    } else {
+        out_[index + pcs.outOff] = in_[index + pcs.inOff];
+    }
+}
--- a/kompute-shaders/op_gelu.comp
+++ b/kompute-shaders/op_gelu.comp
@@ -0,0 +1,22 @@
+#version 450
+
+#include "common.comp"
+
+layout(local_size_x = 1) in;
+
+layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
+layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
+layout(push_constant) uniform PushConstants {
+    uint inOff;
+    uint outOff;
+} pcs;
+
+void main() {
+    const uint baseIndex = gl_WorkGroupID.x * 8;
+
+    for (uint x = 0; x < 8; x++) {
+        const uint i = baseIndex + x;
+        const float y = in_[i + pcs.inOff];
+        out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
+    }
+}
--- a/kompute-shaders/op_getrows.comp
+++ b/kompute-shaders/op_getrows.comp
@@ -0,0 +1,17 @@
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    int z = 0;
+    for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
+        const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
+        const mat4 result = dequantize_block(inIndex, ind%NL);
+        for (uint j = 0; j < 4; ++j) {
+            for (uint k = 0; k < 4; ++k) {
+                const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
+                out_[outIndex] = result[j][k];
+                ++z;
+            }
+        }
+    }
+}
--- a/Show More
+++ b/Show More