test zero max buffer size

2026-02-12 14:03:20 +02:00 · 2024-06-26 17:11:09 +02:00
464 changed files with 4429 additions and 8861 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -27,7 +27,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV GGML_CUDA=1
+ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV GGML_HIPBLAS=1
+ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV GGML_CUDA=1
+ENV LLAMA_CUDA=1

 RUN make -j$(nproc) llama-cli

--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

-ARG GGML_SYCL_F16=OFF
+ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git

@@ -10,11 +10,11 @@ WORKDIR /app

 COPY . .

-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV GGML_HIPBLAS=1
+ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
+RUN cmake -B build -DLLAMA_VULKAN=1 && \
    cmake --build build --config Release --target llama-cli

 # Clean up
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -0,0 +1,84 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-clblast
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        OpenCL Inference of LLaMA model in C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+Requires:       clblast
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama-clblast-cli
+%{_bindir}/llama-clblast-server
+%{_bindir}/llama-clblast-simple
+/usr/lib/systemd/system/llamaclblast.service
+%config /etc/sysconfig/llama
+
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
--- a/.devops/llama-cpp-cuda.srpm.spec
+++ b/.devops/llama-cpp-cuda.srpm.spec
@@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j GGML_CUDA=1
+make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -21,7 +21,7 @@ COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
-ENV GGML_CUDA=1
+ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

-ARG GGML_SYCL_F16=OFF
+ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

@@ -10,11 +10,11 @@ WORKDIR /app

 COPY . .

-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-        echo "GGML_SYCL_F16 is set" && \
-        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -36,7 +36,7 @@ COPY . .
 # Set nvcc architecture
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
-ENV GGML_HIPBLAS=1
+ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release --target llama-server

 # Clean up
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -17,18 +17,19 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  curl,
+  clblast,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
+    useOpenCL
    useRocm
    useVulkan
  ] && blas.meta.available,
  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
  useMpi ? false, # Increases the runtime closure size by ~700M
+  useOpenCL ? false,
  useRocm ? config.rocmSupport,
-  enableCurl ? true,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -55,6 +56,7 @@ let
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useOpenCL [ "OpenCL" ]
    ++ lib.optionals useRocm [ "ROCm" ]
    ++ lib.optionals useVulkan [ "Vulkan" ];

@@ -158,9 +160,9 @@ effectiveStdenv.mkDerivation (
    };

    postPatch = ''
-      substituteInPlace ./ggml/src/ggml-metal.m \
+      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml/src/ggml-metal.m \
+      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
    '';

@@ -196,24 +198,24 @@ effectiveStdenv.mkDerivation (
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
+      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useBlas [ blas ]
-      ++ optionals useVulkan vulkanBuildInputs
-      ++ optionals enableCurl [ curl ];
+      ++ optionals useVulkan vulkanBuildInputs;

    cmakeFlags =
      [
+        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_CURL" enableCurl)
-        (cmakeBool "GGML_NATIVE" false)
-        (cmakeBool "GGML_BLAS" useBlas)
-        (cmakeBool "GGML_CUDA" useCuda)
-        (cmakeBool "GGML_HIPBLAS" useRocm)
-        (cmakeBool "GGML_METAL" useMetalKit)
-        (cmakeBool "GGML_VULKAN" useVulkan)
-        (cmakeBool "GGML_STATIC" enableStatic)
+        (cmakeBool "LLAMA_BLAS" useBlas)
+        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
+        (cmakeBool "LLAMA_CUDA" useCuda)
+        (cmakeBool "LLAMA_HIPBLAS" useRocm)
+        (cmakeBool "LLAMA_METAL" useMetalKit)
+        (cmakeBool "LLAMA_VULKAN" useVulkan)
+        (cmakeBool "LLAMA_STATIC" enableStatic)
      ]
      ++ optionals useCuda [
        (
@@ -229,7 +231,7 @@ effectiveStdenv.mkDerivation (
      ]
      ++ optionals useMetalKit [
        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
      ];

    # Environment variables needed for ROCm
@@ -242,7 +244,7 @@ effectiveStdenv.mkDerivation (
    # if they haven't been added yet.
    postInstall = ''
      mkdir -p $out/include
-      cp $src/include/llama.h $out/include/
+      cp $src/llama.h $out/include/
    '';

    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
@@ -252,6 +254,7 @@ effectiveStdenv.mkDerivation (
        useCuda
        useMetalKit
        useMpi
+        useOpenCL
        useRocm
        useVulkan
        ;
@@ -278,7 +281,7 @@ effectiveStdenv.mkDerivation (
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals useCuda lib.platforms.darwin;
+      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -9,3 +9,5 @@ contact_links:
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
+
+
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -2,31 +2,31 @@
 Kompute:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-kompute.h
-            - ggml/src/ggml-kompute.cpp
+            - ggml-kompute.h
+            - ggml-kompute.cpp
            - README-kompute.md
 Apple Metal:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-metal.h
-            - ggml/src/ggml-metal.cpp
+            - ggml-metal.h
+            - ggml-metal.cpp
            - README-metal.md
 SYCL:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-sycl.h
-            - ggml/src/ggml-sycl.cpp
+            - ggml-sycl.h
+            - ggml-sycl.cpp
            - README-sycl.md
 Nvidia GPU:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml-cuda.h
-            - ggml/src/ggml-cuda/**
+            - ggml-cuda.h
+            - ggml-cuda/**
 Vulkan:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/ggml_vk_generate_shaders.py
-            - ggml/src/ggml-vulkan*
+            - ggml_vk_generate_shaders.py
+            - ggml-vulkan*
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@@ -73,10 +73,10 @@ server:
 ggml:
    - changed-files:
        - any-glob-to-any-file:
-            - ggml/include/ggml*.h
-            - ggml/src/ggml*.c
-            - ggml/src/ggml*.cpp
-            - ggml/src/ggml*.h
+            - ggml.c
+            - ggml.h
+            - ggml-*.c
+            - ggml-*.h
            - ggml-cuda/**
 nix:
    - changed-files:
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -109,7 +109,7 @@ jobs:
        run: |
          set -eux
          cmake -B build \
-              -DGGML_NATIVE=OFF \
+              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
+    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -47,7 +47,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -105,7 +105,7 @@ jobs:
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -222,7 +222,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -305,7 +305,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

      - name: Test
@@ -335,7 +335,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake -DGGML_RPC=ON ..
+          cmake -DLLAMA_RPC=ON ..
          cmake --build . --config Release -j $(nproc)

      - name: Test
@@ -363,7 +363,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake -DGGML_VULKAN=ON ..
+          cmake -DLLAMA_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-hip:
@@ -384,13 +384,13 @@ jobs:
      - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
          cmake --build build --config Release -j $(nproc)

      - name: Build with legacy HIP support
        id: cmake_build_legacy_hip
        run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
          cmake --build build2 --config Release -j $(nproc)

  ubuntu-22-cmake-sycl:
@@ -431,7 +431,7 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)

  ubuntu-22-cmake-sycl-fp16:
@@ -472,10 +472,10 @@ jobs:
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
          cmake --build . --config Release -j $(nproc)

-  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
@@ -497,15 +497,15 @@ jobs:
        env:
            LLAMA_FATAL_WARNINGS: 1
        run: |
-          GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          GGML_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)

-  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
  #       would be great if we fix these
@@ -529,7 +529,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -559,14 +559,13 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-cmake-tvos:
    runs-on: macos-latest
@@ -589,14 +588,13 @@ jobs:
          mkdir build
          cd build
          cmake -G Xcode .. \
-            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

  macOS-latest-swift:
    runs-on: macos-latest
@@ -664,7 +662,7 @@ jobs:
      - name: Build using make w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            make GGML_OPENBLAS=1 -j $(nproc)
+            make LLAMA_OPENBLAS=1 -j $(nproc)

      - name: Build using CMake
        shell: msys2 {0}
@@ -680,7 +678,7 @@ jobs:
      - name: Build using CMake w/ OpenBLAS
        shell: msys2 {0}
        run: |
-            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
            cmake --build build --config ${{ matrix.build }} -j $(nproc)

  windows-latest-cmake:
@@ -695,25 +693,25 @@ jobs:
      matrix:
        include:
          - build: 'rpc-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -726,7 +724,7 @@ jobs:
        id: clone_kompute
        if: ${{ matrix.build == 'kompute-x64' }}
        run: |
-          git submodule update --init ggml/src/kompute
+          git submodule update --init kompute

      - name: Download OpenBLAS
        id: get_openblas
@@ -799,7 +797,6 @@ jobs:
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
-          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
          & $sde -future -- ctest -L main -C Release --verbose --timeout 900

      - name: Determine tag name
@@ -857,7 +854,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
@@ -990,7 +987,7 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
          cmake --build build --config Release

  ios-xcode-build:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -14,7 +14,6 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -92,12 +92,12 @@ jobs:
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
-              -DGGML_NATIVE=OFF \
+              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-              -DGGML_OPENMP=OFF ;
+              -DLLAMA_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

      - name: Build
@@ -105,7 +105,7 @@ jobs:
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
-              -DGGML_NATIVE=OFF \
+              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
--- a/.gitignore
+++ b/.gitignore
@@ -56,7 +56,6 @@ CMakeSettings.json
 compile_commands.json
 ggml-metal-embed.metal
 llama-batched-swift
-/rpc-server
 out/
 tmp/

@@ -98,14 +97,13 @@ examples/server/*.mjs.hpp

 # Python

-/.venv
-__pycache__/
-*/poetry.lock
+__pycache__
+.venv
+/Pipfile
+dist
+poetry.lock
 poetry.toml

-# Nix
-/result
-
 # Test binaries
 /tests/test-backend-ops
 /tests/test-double-float
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/kompute
+	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/129
+++ b/129
@@ -1,9 +1,8 @@
-# date: Wed Jun 26 19:36:34 EEST 2024
+# date: Tue Apr  9 09:17:14 EEST 2024
 # this file is auto-generated by scripts/gen-authors.sh

 0cc4m <picard12@live.de>
 0xspringtime <110655352+0xspringtime@users.noreply.github.com>
-20kdc <asdd2808@gmail.com>
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
@@ -12,18 +11,14 @@ AT <manyoso@users.noreply.github.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
-Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
 Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
 Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
-Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 Aisuko <urakiny@gmail.com>
-Akarshan Biswas <akarshanbiswas@fedoraproject.org>
-Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
@@ -40,24 +35,19 @@ Ali Nehzat <ali.nehzat@thanks.dev>
 Ali Tariq <ali.tariq@10xengineers.ai>
 Alon <alonfaraj@gmail.com>
 AlpinDale <52078762+AlpinDale@users.noreply.github.com>
-Amir <amir_zia@outlook.com>
 AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
 András Salamon <ott2@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
-Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
-Andy Tai <andy-tai@users.noreply.github.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
-Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
-Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
 Atsushi Tatsuma <yoshoku@outlook.com>
@@ -67,46 +57,35 @@ BADR <contact@pythops.com>
 Bach Le <bach@bullno1.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
-Bartowski <ckealty1182@gmail.com>
 Behnam M <58621210+ibehnam@users.noreply.github.com>
-Ben Ashbaugh <ben.ashbaugh@intel.com>
 Ben Garney <bengarney@users.noreply.github.com>
 Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
-Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
 Bernat Vadell <hounter.caza@gmail.com>
-Bingan <70050083+binganao@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Branden Butler <bwtbutler@hotmail.com>
 Brian <mofosyne@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
-Bryan Honof <bryanhonof@gmail.com>
 CJ Pais <cj@cjpais.com>
 CRD716 <crd716@gmail.com>
-Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
-Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 Chad Brewbaker <crb002@gmail.com>
-Chao Jiang <jc19chaoj@zoho.com>
 Cheng Shao <terrorjack@type.dance>
-Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
 Christian Kögler <ck3d@gmx.de>
-Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
-CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
 DAN™ <dranger003@gmail.com>
 Damian Stewart <d@damianstewart.com>
@@ -116,12 +95,8 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
-Daniele <57776841+daniandtheweb@users.noreply.github.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
-Dave <dave-fl@users.noreply.github.com>
-Dave Airlie <airlied@gmail.com>
-Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
 David Friehs <david@friehs.info>
 David Kennedy <dakennedyd@gmail.com>
@@ -129,13 +104,10 @@ David Pflug <david@pflug.email>
 David Renshaw <dwrenshaw@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
-Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
 Deins <deinsegle@gmail.com>
-Deven Mistry <31466137+deven367@users.noreply.github.com>
 Didzis Gosko <didzis@users.noreply.github.com>
-Djip007 <djip.perois@free.fr>
 Don Mahurin <dmahurin@users.noreply.github.com>
 DooWoong Lee (David) <manics99@naver.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
@@ -144,11 +116,8 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
 Ebey Abraham <ebey97@gmail.com>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
-Eddie-Wang <wangjinheng1120@163.com>
 Edward Taylor <edeetee@gmail.com>
-Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
-Elton Kola <eltonkola@gmail.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Sommerlade <es0m@users.noreply.github.com>
@@ -174,47 +143,37 @@ Firat <firatkiral@gmail.com>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
-Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
-Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
 Gary Mulder <gjmulder@gmail.com>
-Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
-Giuseppe Scrivano <giuseppe@scrivano.org>
 GiviMAD <GiviMAD@users.noreply.github.com>
 Govlzkoy <gotope@users.noreply.github.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
-Haggai Nuchi <h.nuchi@gmail.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
-Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
-HanishKVC <hanishkvc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
-HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
-Hong Bo PENG <penghb@cn.ibm.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
 Hua Jiang <allenhjiang@outlook.com>
 Huawei Lin <huaweilin.cs@gmail.com>
-Hugo Roussel <hugo.rous@gmail.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
@@ -231,10 +190,8 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
 JH23X <165871467+JH23X@users.noreply.github.com>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
-Jaemin Son <woalsdnd@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
 Jakub N <jakubniemczyk97@gmail.com>
-James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
 James Reynolds <magnusviri@users.noreply.github.com>
 Jan Boon <jan.boon@kaetemi.be>
 Jan Boon <kaetemi@gmail.com>
@@ -248,17 +205,12 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
-Jeximo <jeximo@gmail.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
 Jiahao Li <liplus17@163.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
-Jiří Sejkora <Sejseloid@gmail.com>
-Joan Fontanals <jfontanalsmartinez@gmail.com>
-Joan Fontanals <joan.fontanals.martinez@jina.ai>
-Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
 John <78893154+cmp-nct@users.noreply.github.com>
@@ -269,19 +221,15 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
 Jorge A <161275481+jorgealias@users.noreply.github.com>
 Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
 Joseph Stahl <1269177+josephst@users.noreply.github.com>
-Josh Ramer <josh.ramer@icloud.com>
 Joyce <joycebrum@google.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
-Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
 Justin Suess <justin.suess@westpoint.edu>
-Justina Cho <justcho5@gmail.com>
 Justine Tunney <jtunney@gmail.com>
-Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
 KASR <karim.asrih@gmail.com>
 Kamil Tomšík <info@tomsik.cz>
@@ -294,7 +242,6 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
 Kenvix ⭐ <kenvixzure@live.com>
 Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
-Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
@@ -310,7 +257,6 @@ Laura <Tijntje_7@msn.com>
 Lee <44310445+lx200916@users.noreply.github.com>
 Lee Drake <b.lee.drake@gmail.com>
 Leng Yue <lengyue@lengyue.me>
-Leon Knauer <git@leonknauer.com>
 LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
 Li Tan <tanliboy@gmail.com>
@@ -319,26 +265,20 @@ LoganDark <github@logandark.mozmail.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 Luciano <lucianostrika44@gmail.com>
 Luo Tian <lt@basecity.com>
-Lyle Dean <dean@lyle.dev>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
-Manuel <44313466+makuche@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
 Marian Cepok <marian.cepok@gmail.com>
 Mark Fairbairn <thebaron88@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
-Markus Tavenrath <mtavenrath@users.noreply.github.com>
-Martin Delille <martin@delille.org>
 Martin Krasser <krasserm@googlemail.com>
 Martin Schwaighofer <mschwaig@users.noreply.github.com>
 Marvin Gießing <marvin.giessing@gmail.com>
-Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
-MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
 Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
@@ -347,11 +287,8 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
-Mattheus Chediak <shammcity00@gmail.com>
 Matthew Tejo <matthew.tejo@gmail.com>
 Matvey Soloviev <blackhole89@gmail.com>
-Max Krasnyansky <max.krasnyansky@gmail.com>
-Max Krasnyansky <quic_maxk@quicinc.com>
 Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
 Meng Zhang <meng@tabbyml.com>
@@ -363,41 +300,32 @@ Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
 Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
-Michael de Gans <michael.john.degans@gmail.com>
 Michaël de Vries <vriesdemichael@gmail.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
-Mikko Juola <mikjuo@gmail.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
-Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
-Nathan Epstein <nate2@umbc.edu>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
-Neo Zhang <14088817+arthw@users.noreply.github.com>
-Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
-Nicolás Pérez <nicolas_perez@brown.edu>
 Nigel Bosch <pnigelb@gmail.com>
 Niklas Korz <niklas@niklaskorz.de>
-Nikolas <127742645+nneubacher@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
-Patrice Ferlet <metal3d@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
 Pavol Rusnak <pavol@rusnak.io>
 Pedro Cuenca <pedro@huggingface.co>
@@ -415,14 +343,9 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
 Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
-Raj Hammeer Singh Hada <hammeerraj@gmail.com>
-Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Reinforce-II <fate@eastal.com>
-Ren Xuancheng <jklj077@users.noreply.github.com>
-Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
-RhinoDevel <RhinoDevel@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Richard Kiss <him@richardkiss.com>
 Richard Roberson <richardr1126@gmail.com>
@@ -450,7 +373,6 @@ Rowan Hart <rowanbhart@gmail.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
 Ryan Landay <rlanday@gmail.com>
 Ryder Wishart <ryderwishart@gmail.com>
-Ryuei <louixs@users.noreply.github.com>
 Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
@@ -464,7 +386,6 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
 Senemu <10880819+Senemu@users.noreply.github.com>
 Sergey Alirzaev <zl29ah@gmail.com>
 Sergio López <slp@sinrega.org>
-Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
@@ -473,7 +394,6 @@ Shijie <821898965@qq.com>
 Shintarou Okada <kokuzen@gmail.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
-Shuichi Tsutsumi <shuichi0526@gmail.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
 Simon Willison <swillison@gmail.com>
 Siwen Yu <yusiwen@gmail.com>
@@ -485,14 +405,11 @@ Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
 Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
 Spencer Sutton <spencersutton@users.noreply.github.com>
-Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
 Srinivas Billa <nivibilla@gmail.com>
 Stefan Sydow <stefan@sydow.email>
-Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
 Steve Grubb <ausearch.1@gmail.com>
-Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
@@ -517,19 +434,16 @@ Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
-Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
-Ulrich Drepper <drepper@gmail.com>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
 Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
-Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
 Vlad <spitfireage@gmail.com>
 Vladimir <bogdad@gmail.com>
@@ -541,9 +455,7 @@ Weird Constructor <weirdconstructor@gmail.com>
 Welby Seely <welbyseely@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
-William Tambellini <william.tambellini@gmail.com>
 Willy Tarreau <w@1wt.eu>
-Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
 Wu Jian Ping <wujjpp@hotmail.com>
 Wu Jian Ping <wujp@greatld.com>
 Xiake Sun <xiake.sun@intel.com>
@@ -554,8 +466,6 @@ Xiaoyi Chen <cxychina@gmail.com>
 Xingchen Song(宋星辰) <xingchensong1996@163.com>
 Xuan Son Nguyen <thichthat@gmail.com>
 Yann Follet <131855179+YannFollet@users.noreply.github.com>
-Yaroslav <yaroslav.yashin@me.com>
-Yazan Agha-Schrader <mountaiin@icloud.com>
 Yiming Cui <conandiy@vip.qq.com>
 Yishuo Wang <MeouSker77@outlook.com>
 Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
@@ -567,7 +477,6 @@ Zane Shannon <z@zcs.me>
 Zay <95888118+isaiahbjork@users.noreply.github.com>
 Zenix <zenixls2@gmail.com>
 Zhang Peiyuan <a1286225768@gmail.com>
-Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 ZhouYuChen <zhouyuchen@naver.com>
 Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
 Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
@@ -575,18 +484,14 @@ Zsapi <martin1.zsapka@gmail.com>
 a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
 adel boussaken <netdur@gmail.com>
 afrideva <95653597+afrideva@users.noreply.github.com>
-agray3 <agray3@users.noreply.github.com>
 akawrykow <142945436+akawrykow@users.noreply.github.com>
 alexpinel <93524949+alexpinel@users.noreply.github.com>
 alonfaraj <alonfaraj@gmail.com>
-alwqx <kenan3015@gmail.com>
-amd-lalithnc <lalithnc@amd.com>
 andrijdavid <david@geek.mg>
 anon998 <131767832+anon998@users.noreply.github.com>
 anzz1 <anzz1@live.com>
 apaz <aarpazdera@gmail.com>
 apcameron <37645737+apcameron@users.noreply.github.com>
-arch-btw <57669023+arch-btw@users.noreply.github.com>
 arcrank <arcrank@gmail.com>
 arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 at8u <129688334+at8u@users.noreply.github.com>
@@ -609,17 +514,13 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
 coezbek <c.oezbek@gmail.com>
 comex <comexk@gmail.com>
 compilade <113953597+compilade@users.noreply.github.com>
-compilade <git@compilade.net>
-cpumaxx <163466046+cpumaxx@users.noreply.github.com>
 crasm <crasm@git.vczf.net>
 crasm <crasm@git.vczf.us>
 daboe01 <daboe01@googlemail.com>
 david raistrick <keen99@users.noreply.github.com>
-ddh0 <dylanhalladay02@icloud.com>
 ddpasa <112642920+ddpasa@users.noreply.github.com>
 deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
 divinity76 <divinity76@gmail.com>
-dm4 <sunrisedm4@gmail.com>
 dotpy314 <33351922+dotpy314@users.noreply.github.com>
 drbh <david.richard.holtz@gmail.com>
 ds5t5 <145942675+ds5t5@users.noreply.github.com>
@@ -628,7 +529,6 @@ eastriver <lee@eastriver.dev>
 ebraminio <ebraminio@gmail.com>
 eiery <19350831+eiery@users.noreply.github.com>
 eric8607242 <e0928021388@gmail.com>
-fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 github-actions[bot] <github-actions[bot]@users.noreply.github.com>
 gliptic <gliptic@users.noreply.github.com>
@@ -639,7 +539,6 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 hankcs <cnhankmc@gmail.com>
 hoangmit <hoangmit@users.noreply.github.com>
 hongbo.mo <352280764@qq.com>
-hopkins385 <98618192+hopkins385@users.noreply.github.com>
 howlger <eclipse@voormann.de>
 howlger <github@voormann.de>
 hutli <6594598+hutli@users.noreply.github.com>
@@ -650,22 +549,14 @@ hydai <z54981220@gmail.com>
 iSma <ismail.senhaji@gmail.com>
 iacore <74560659+iacore@users.noreply.github.com>
 igarnier <igarnier@protonmail.com>
-intelmatt <61025942+intelmatt@users.noreply.github.com>
 iohub <rickyang.pro@gmail.com>
 jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
-jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
 jameswu2014 <545426914@qq.com>
-jiez <373447296@qq.com>
 jneem <joeneeman@gmail.com>
-joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
 johnson442 <56517414+johnson442@users.noreply.github.com>
-jojorne <jojorne@users.noreply.github.com>
 jon-chuang <9093549+jon-chuang@users.noreply.github.com>
 jp-x-g <jpxg-dev@protonmail.com>
-jukofyork <69222624+jukofyork@users.noreply.github.com>
-junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
 jwj7140 <32943891+jwj7140@users.noreply.github.com>
-k.h.lai <adrian.k.h.lai@outlook.com>
 kaizau <kaizau@users.noreply.github.com>
 kalomaze <66376113+kalomaze@users.noreply.github.com>
 kang <tpdns9032100@gmail.com>
@@ -684,15 +575,11 @@ ldwang <ftgreat@163.com>
 le.chang <cljs118@126.com>
 leejet <leejet714@gmail.com>
 limitedAtonement <limitedAtonement@users.noreply.github.com>
-liuwei-git <14815172+liuwei-git@users.noreply.github.com>
 lon <114724657+longregen@users.noreply.github.com>
-loonerin <132926317+loonerin@users.noreply.github.com>
-luoyu-intel <yu.luo@intel.com>
 m3ndax <adrian.goessl@outlook.com>
 maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 makomk <makosoft@googlemail.com>
 manikbhandari <mbbhandarimanik2@gmail.com>
-maor-ps <154728172+maor-ps@users.noreply.github.com>
 mdrokz <mohammadmunshi@gmail.com>
 mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
 minarchist <minarchist@users.noreply.github.com>
@@ -706,19 +593,15 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
 nhamanasu <45545786+nhamanasu@users.noreply.github.com>
 niansa/tuxifan <anton-sa@web.de>
 niansa/tuxifan <tuxifan@posteo.de>
-nickp27 <nb.porter@gmail.com>
 ningshanwutuobang <ningshanwutuobang@gmail.com>
 nold <Nold360@users.noreply.github.com>
 nopperl <54780682+nopperl@users.noreply.github.com>
 nusu-github <29514220+nusu-github@users.noreply.github.com>
 olexiyb <olexiyb@gmail.com>
-omahs <73983677+omahs@users.noreply.github.com>
 oobabooga <112222186+oobabooga@users.noreply.github.com>
 opparco <parco.opaai@gmail.com>
 ostix360 <55257054+ostix360@users.noreply.github.com>
-pengxin99 <pengxin.yuan@intel.com>
 perserk <perserk@gmail.com>
-pmysl <piotr.myslinski@outlook.com>
 postmasters <namnguyen@google.com>
 pudepiedj <pudepiedj@gmail.com>
 qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
@@ -731,19 +614,16 @@ rhuddleston <ryan.huddleston@percona.com>
 rimoliga <53384203+rimoliga@users.noreply.github.com>
 runfuture <runfuture@users.noreply.github.com>
 sandyiscool <sandyiscool@gmail.com>
-sasha0552 <admin@sasha0552.org>
 semidark <me@semidark.net>
 sharpHL <132747147+sharpHL@users.noreply.github.com>
 shibe2 <shibe@tuta.io>
 singularity <12184989+singularity-s0@users.noreply.github.com>
 sjinzh <sjinzh@gmail.com>
-sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
 slaren <2141330+slaren@users.noreply.github.com>
 slaren <slarengh@gmail.com>
 snadampal <87143774+snadampal@users.noreply.github.com>
 staviq <staviq@gmail.com>
 stduhpf <stephduh@live.fr>
-strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 swittk <switt1995@gmail.com>
 takov751 <40316768+takov751@users.noreply.github.com>
 tarcey <cey.tarik@gmail.com>
@@ -756,16 +636,12 @@ uint256_t <konndennsa@gmail.com>
 uint256_t <maekawatoshiki1017@gmail.com>
 unbounded <haakon@likedan.net>
 valiray <133289098+valiray@users.noreply.github.com>
-vik <vikhyatk@gmail.com>
-viric <viric@viric.name>
 vodkaslime <646329483@qq.com>
 vvhg1 <94630311+vvhg1@users.noreply.github.com>
 vxiiduu <73044267+vxiiduu@users.noreply.github.com>
 wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
 whoreson <139810751+whoreson@users.noreply.github.com>
-woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
-woodx <124784234+woodx9@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
@@ -773,10 +649,7 @@ xloem <0xloem@gmail.com>
 yangli2 <yangli2@gmail.com>
 yuiseki <yuiseki@gmail.com>
 zakkor <edward.partenie@gmail.com>
-zhangkaihuo <zhangkaihuo@gmail.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
-zhouwg <zhouwg2000@gmail.com>
 zrm <trustiosity.zrm@gmail.com>
-Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
 源文雨 <41315874+fumiama@users.noreply.github.com>
 Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -19,15 +19,14 @@
        "cacheVariables": {
            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
            "CMAKE_CXX_COMPILER": "icx",
-            "CMAKE_C_COMPILER": "cl",
-            "GGML_SYCL": "ON",
+            "LLAMA_SYCL": "ON",
            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
        }
    },
    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
-    { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

    {
        "name": "arm64-windows-msvc", "hidden": true,
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,24 +1,14 @@
-# Pull requests
+# Contributing Guidelines

- Always squash-merge the PR before merging
- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Test your changes:
-  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
-  - Execute [the full CI locally on your machine](ci/README.md) before publishing
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
+## Checklist

-# Coding guidelines
+* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
+* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+* Execute [the full CI locally on your machine](ci/README.md) before publishing

- Avoid adding third-party dependencies, extra files, extra headers, etc.
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
-
-![matmul](media/matmul.png)
+## PR formatting

+* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+    - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
+* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
+* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
--- a/1075
+++ b/1075
--- a/Package.swift
+++ b/Package.swift
@@ -3,13 +3,14 @@
 import PackageDescription

 var sources = [
-    "src/llama.cpp",
-    "src/unicode.cpp",
-    "src/unicode-data.cpp",
-    "ggml/src/ggml.c",
-    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.c",
-    "ggml/src/ggml-quants.c",
+    "ggml.c",
+    "sgemm.cpp",
+    "llama.cpp",
+    "unicode.cpp",
+    "unicode-data.cpp",
+    "ggml-alloc.c",
+    "ggml-backend.c",
+    "ggml-quants.c",
 ]

 var resources: [Resource] = []
@@ -25,8 +26,8 @@ var cSettings: [CSetting] =  [
 ]

 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
    contentsOf: [
@@ -62,6 +63,8 @@ let package = Package(
               "models",
               "tests",
               "CMakeLists.txt",
+               "ggml-cuda.cu",
+               "ggml-cuda.h",
               "Makefile"
            ],
            sources: sources,
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
 ```

 *Notes*:

-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
+To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.

 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.

@@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
@@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v
@@ -422,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release

 # Option 2: Or FP16
-cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+cmake -B build -G "Ninja" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

 cmake --build build --config Release -j
 ```
@@ -440,7 +440,7 @@ Or, use CMake presets to build:
 cmake --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-cli

-cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
+cmake -DLLAMA_SYCL_F16=ON --preset x64-windows-sycl-release
 cmake --build build-x64-windows-sycl-release -j --target llama-cli

 cmake --preset x64-windows-sycl-debug
@@ -544,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 | Name               | Value                             | Function                                    |
 |--------------------|-----------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.           |
-| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
+| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

--- a/README.md
+++ b/README.md
@@ -15,7 +15,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
@@ -26,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
 - Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
@@ -108,7 +107,6 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@@ -218,11 +216,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
-
-**Infrastructure:**
-
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

 ---

@@ -422,7 +415,7 @@ Flox follows the nixpkgs build of llama.cpp.
 ### Metal Build

 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
-To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
+To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.

 When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.
@@ -442,7 +435,7 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `make`:
    - On Linux:
      ```bash
-      make GGML_OPENBLAS=1
+      make LLAMA_OPENBLAS=1
      ```

    - On Windows:
@@ -457,13 +450,13 @@ Building the program with BLAS support may lead to some performance improvements
      8. From here you can run:

          ```bash
-          make GGML_OPENBLAS=1
+          make LLAMA_OPENBLAS=1
          ```

  - Using `CMake` on Linux:

      ```bash
-      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
      cmake --build build --config Release
      ```

@@ -482,10 +475,10 @@ Building the program with BLAS support may lead to some performance improvements
  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).

  - Using manual oneAPI installation:
-    By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
      ```bash
      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
+      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
      cmake --build build --config Release
      ```

@@ -502,28 +495,28 @@ Building the program with BLAS support may lead to some performance improvements

  - Using `make`:
    ```bash
-    make GGML_CUDA=1
+    make LLAMA_CUDA=1
    ```
  - Using `CMake`:

    ```bash
-    cmake -B build -DGGML_CUDA=ON
+    cmake -B build -DLLAMA_CUDA=ON
    cmake --build build --config Release
    ```

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

-  | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
-  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | GGML_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | GGML_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
-  | GGML_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
-  | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
-  | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
-  | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
-  | GGML_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
-  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
-  | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |
+  | Option                         | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
+  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
+  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
+  | LLAMA_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
+  | LLAMA_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
+  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
+  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
+  | LLAMA_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                  |

 - #### hipBLAS

@@ -533,15 +526,15 @@ Building the program with BLAS support may lead to some performance improvements

  - Using `make`:
    ```bash
-    make GGML_HIPBLAS=1
+    make LLAMA_HIPBLAS=1
    ```
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
    ```bash
    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
        && cmake --build build --config Release -- -j 16
    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

    Note that if you get the following error:
@@ -555,19 +548,19 @@ Building the program with BLAS support may lead to some performance improvements
    ```bash
    HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
    HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-        cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
        && cmake --build build -- -j 16
    ```

  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
-    make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
+    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
    ```

  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
    ```bash
    set PATH=%HIP_PATH%\bin;%PATH%
-    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+    cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
    cmake --build build
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -578,11 +571,11 @@ Building the program with BLAS support may lead to some performance improvements
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

-  | Option                 | Legal values           | Default | Description                                                                                                                                                                                                                                    |
-  |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-  | GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
-  | GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |
+  | Option                  | Legal values           | Default | Description                                                                                                                                                                                                                                    |
+  |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |

 - #### Vulkan

@@ -620,7 +613,7 @@ Building the program with BLAS support may lead to some performance improvements
  Then, build llama.cpp using the cmake command below:

  ```bash
-  cmake -B build -DGGML_VULKAN=1
+  cmake -B build -DLLAMA_VULKAN=1
  cmake --build build --config Release
  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
  ./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
@@ -636,8 +629,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
-It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@@ -654,7 +647,7 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert_hf_to_gguf.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/

 # quantize the model to 4-bits (using Q4_K_M method)
 ./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
@@ -976,11 +969,22 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues and PRs is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

+### Coding guidelines
+
+- Avoid adding third-party dependencies, extra files, extra headers, etc.
+- Always consider cross-compatibility with other operating systems and architectures
+- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
+- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)
+
 ### Docs

 - [main (cli)](./examples/main/README.md)
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -36,11 +36,11 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        exit 1
    fi

-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
 fi
 ## helpers

@@ -284,10 +284,10 @@ function gg_run_open_llama_7b_v2 {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -421,7 +421,7 @@ function gg_run_pythia_1_4b {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -550,10 +550,10 @@ function gg_run_pythia_2_8b {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -688,7 +688,7 @@ function gg_run_embd_bge_small {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
--- a/ggml/cmake/FindSIMD.cmake
+++ b/ggml/cmake/FindSIMD.cmake
@@ -79,22 +79,22 @@ endmacro()
 # flags are for MSVC only!
 check_sse("AVX" " ;/arch:AVX")
 if (NOT ${AVX_FOUND})
-    set(GGML_AVX OFF)
+    set(LLAMA_AVX OFF)
 else()
-    set(GGML_AVX ON)
+    set(LLAMA_AVX ON)
 endif()

 check_sse("AVX2" " ;/arch:AVX2")
 check_sse("FMA" " ;/arch:AVX2")
 if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-    set(GGML_AVX2 OFF)
+    set(LLAMA_AVX2 OFF)
 else()
-    set(GGML_AVX2 ON)
+    set(LLAMA_AVX2 ON)
 endif()

 check_sse("AVX512" " ;/arch:AVX512")
 if (NOT ${AVX512_FOUND})
-    set(GGML_AVX512 OFF)
+    set(LLAMA_AVX512 OFF)
 else()
-    set(GGML_AVX512 ON)
+    set(LLAMA_AVX512 ON)
 endif()
--- a/cmake/git-vars.cmake
+++ b/cmake/git-vars.cmake
@@ -1,22 +0,0 @@
-find_package(Git)
-
-# the commit's SHA1
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_SHA1
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the date of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_DATE
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the subject of the commit
-execute_process(COMMAND
-    "${GIT_EXECUTABLE}" log -1 --format=%s
-    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,6 +1,5 @@
 # common

-find_package(Threads REQUIRED)

 # Build info header
 #
@@ -37,7 +36,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
@@ -84,5 +83,5 @@ if (LLAMA_CURL)
 endif ()

 target_include_directories(${TARGET} PUBLIC .)
-target_compile_features   (${TARGET} PUBLIC cxx_std_11)
-target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -472,14 +472,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        else { invalid_param = true; }
        return true;
    }
-    if (arg == "--attention") {
-        CHECK_ARG
-        std::string value(argv[i]);
-        /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
-        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
-        else { invalid_param = true; }
-        return true;
-    }
    if (arg == "--defrag-thold" || arg == "-dt") {
        CHECK_ARG
        params.defrag_thold = std::stof(argv[i]);
@@ -765,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cache_type_v = argv[++i];
        return true;
    }
-    if (arg == "-mli" || arg == "--multiline-input") {
+    if (arg == "--multiline-input") {
        params.multiline_input = true;
        return true;
    }
@@ -1022,23 +1014,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    }
    if (arg == "--in-prefix-bos") {
        params.input_prefix_bos = true;
-        params.enable_chat_template = false;
        return true;
    }
    if (arg == "--in-prefix") {
        CHECK_ARG
        params.input_prefix = argv[i];
-        params.enable_chat_template = false;
        return true;
    }
    if (arg == "--in-suffix") {
        CHECK_ARG
        params.input_suffix = argv[i];
-        params.enable_chat_template = false;
-        return true;
-    }
-    if (arg == "--spm-infill") {
-        params.spm_infill = true;
        return true;
    }
    if (arg == "--grammar") {
@@ -1402,9 +1387,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "       --keep N",               "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
    options.push_back({ "*",           "       --chunks N",             "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
    options.push_back({ "*",           "-fa,   --flash-attn",           "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
-    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with\n"
-                                                                        "in conversation mode, this will be used as system prompt\n"
-                                                                        "(default: '%s')", params.prompt.c_str() });
+    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with (default: '%s')", params.prompt.c_str() });
    options.push_back({ "*",           "-f,    --file FNAME",           "a file containing the prompt (default: none)" });
    options.push_back({ "*",           "       --in-file FNAME",        "an input file (repeat to specify multiple files)" });
    options.push_back({ "*",           "-bf,   --binary-file FNAME",    "binary file containing the prompt (default: none)" });
@@ -1419,17 +1402,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                        "halt generation at PROMPT, return control in interactive mode\n"
                                                                        "can be specified more than once for multiple prompts" });
    options.push_back({ "main",        "-sp,   --special",              "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode, does not print special tokens and suffix/prefix\n"
-                                                                        "if suffix/prefix are not specified, default chat template will be used\n"
-                                                                        "(default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main",        "-cnv,  --conversation",         "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
    options.push_back({ "main infill", "-i,    --interactive",          "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
    options.push_back({ "main infill", "-if,   --interactive-first",    "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
    options.push_back({ "main infill", "-mli,  --multiline-input",      "allows you to write or paste multiple lines without ending each in '\\'" });
    options.push_back({ "main infill", "       --in-prefix-bos",        "prefix BOS to user inputs, preceding the `--in-prefix` string" });
    options.push_back({ "main infill", "       --in-prefix STRING",     "string to prefix user inputs with (default: empty)" });
    options.push_back({ "main infill", "       --in-suffix STRING",     "string to suffix after user inputs with (default: empty)" });
-    options.push_back({ "server infill",
-                                       "       --spm-infill",           "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

    options.push_back({ "sampling" });
    options.push_back({ "*",           "       --samplers SAMPLERS",    "samplers that will be used for generation in the order, separated by \';\'\n"
@@ -1465,7 +1444,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "main",        "       --cfg-scale N",          "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
    options.push_back({ "main",        "       --chat-template JINJA_TEMPLATE",
                                                                        "set custom jinja chat template (default: template taken from model's metadata)\n"
-                                                                        "if suffix/prefix are specified, template will be disabled\n"
                                                                        "only commonly used templates are accepted:\n"
                                                                        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
    options.push_back({ "grammar" });
@@ -1476,10 +1454,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                        "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });

    options.push_back({ "embedding" });
-    options.push_back({ "embedding",   "       --pooling {none,mean,cls,last}",
+    options.push_back({ "embedding",   "       --pooling {none,mean,cls}",
                                                                        "pooling type for embeddings, use model default if unspecified" });
-    options.push_back({ "embedding",   "       --attention {causal,non-causal}",
-                                                                        "attention type for embeddings, use model default if unspecified" });

    options.push_back({ "context hacking" });
    options.push_back({ "*",           "       --rope-scaling {none,linear,yarn}",
@@ -2085,24 +2061,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

-        std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
-        // some models (e.g. T5) don't have a BOS token
-        if (bos != -1) {
-            tmp.push_back(bos);
-        }
-        tmp.push_back(eos);
-
-        if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
-            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id == -1) {
-                decoder_start_token_id = bos;
-            }
-            tmp.clear();
-            tmp.push_back(decoder_start_token_id);
-        }
+        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);
@@ -2185,7 +2144,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
    cparams.pooling_type      = params.pooling_type;
-    cparams.attention_type    = params.attention_type;
    cparams.defrag_thold      = params.defrag_thold;
    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
@@ -2660,7 +2618,6 @@ std::string llama_chat_apply_template(const struct llama_model * model,
        const std::vector<llama_chat_msg> & msgs,
        bool add_ass) {
    int alloc_size = 0;
-    bool fallback = false; // indicate if we must fallback to default chatml
    std::vector<llama_chat_message> chat;
    for (auto & msg : msgs) {
        chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2673,26 +2630,10 @@ std::string llama_chat_apply_template(const struct llama_model * model,
    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

-    // error: chat template is not supported
-    if (res < 0) {
-        if (ptr_tmpl != nullptr) {
-            // if the custom "tmpl" is not supported, we throw an error
-            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-            throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
-        }
-    }
-
    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
-        res = llama_chat_apply_template(
-            fallback ? nullptr : model,
-            fallback ? "chatml" : ptr_tmpl,
-            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }

    std::string formatted_chat(buf.data(), res);
@@ -2704,19 +2645,12 @@ std::string llama_chat_format_single(const struct llama_model * model,
        const std::vector<llama_chat_msg> & past_msg,
        const llama_chat_msg & new_msg,
        bool add_ass) {
-    std::ostringstream ss;
    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
    std::vector<llama_chat_msg> chat_new(past_msg);
-    // if the past_msg ends with a newline, we must preserve it in the formatted version
-    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
-        ss << "\n";
-    };
-    // format chat with new_msg
    chat_new.push_back(new_msg);
    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
-    // get the diff part
-    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-    return ss.str();
+    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return formatted;
 }

 std::string llama_chat_format_example(const struct llama_model * model,
@@ -2870,87 +2804,125 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //

 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+    int32_t n_tensors;
+
+    size_t n_bytes = 0;
+
+    uint32_t max_direction_layer = 0;
+
    llama_control_vector_data result = { -1, {} };

-    ggml_context * ctx = nullptr;
-    struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
-        /* .ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+            ggml_free(meta_ctx);
+            return result;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return result;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return result;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            if (result.n_embd == -1) {
+                result.n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    if (n_tensors == 0) {
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
        return result;
    }

-    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
-    if (n_tensors == 0) {
-        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+        ggml_free(ctx);
+        return result;
    }

-    for (int i = 0; i < n_tensors; i++) {
-        std::string name = gguf_get_tensor_name(ctx_gguf, i);
+    // do not store data for layer 0 (it's not used)
+    result.data.resize(result.n_embd * max_direction_layer);

-        int layer_idx = -1;
+    for (uint32_t il = 1; il <= max_direction_layer; il++) {
+        const std::string name = "direction." + std::to_string(il);
+        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());

-        // split on '.'
-        size_t dotpos = name.find('.');
-        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-            try {
-                layer_idx = std::stoi(name.substr(dotpos + 1));
-            } catch (...) {
-                layer_idx = -1;
+        float * dst = result.data.data() + result.n_embd * (il - 1);
+
+        if (tensor) {
+            const float * src = (const float *) tensor->data;
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = src[j] * load_info.strength;
+            }
+        } else {
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = 0.0f;
            }
        }
-        if (layer_idx < 0) {
-            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
-            result.n_embd = -1;
-            break;
-        } else if (layer_idx == 0) {
-            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
-            result.n_embd = -1;
-            break;
-        }
-
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-        if (tensor->type != GGML_TYPE_F32) {
-            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
-            result.n_embd = -1;
-            break;
-        }
-        if (ggml_n_dims(tensor) != 1) {
-            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
-            result.n_embd = -1;
-            break;
-        }
-
-        if (result.n_embd == -1) {
-            result.n_embd = ggml_nelements(tensor);
-        } else if (ggml_nelements(tensor) != result.n_embd) {
-            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
-            result.n_embd = -1;
-            break;
-        }
-
-        // extend if necessary - do not store data for layer 0 (it's not used)
-        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
-
-        const float * src = (const float *) tensor->data;
-        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
-        for (int j = 0; j < result.n_embd; j++) {
-            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
-        }
-
    }

-    if (result.n_embd == -1) {
-        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
-        result.data.clear();
-    }
-
-    gguf_free(ctx_gguf);
-    ggml_free(ctx);
-
    return result;
 }

@@ -2961,19 +2933,16 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
        auto cur = llama_control_vector_load_one(info);

        if (cur.n_embd == -1) {
-            result.n_embd = -1;
-            break;
+            return result;
        }
-        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
-            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
-            result.n_embd = -1;
-            break;
+        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
+            return result;
        }

        if (result.n_embd == -1) {
            result = std::move(cur);
        } else {
-            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f);  // extend if necessary
            for (size_t i = 0; i < cur.data.size(); i++) {
                result.data[i] += cur.data[i];
            }
@@ -2981,8 +2950,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
    }

    if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
-        result.data.clear();
+        fprintf(stderr, "%s: no vectors passed\n", __func__);
    }

    return result;
--- a/common/common.h
+++ b/common/common.h
@@ -99,7 +99,6 @@ struct gpt_params {
    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
-    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;
@@ -201,7 +200,6 @@ struct gpt_params {
    std::string public_path   = "";
    std::string chat_template = "";
    std::string system_prompt = "";
-    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

@@ -252,8 +250,6 @@ struct gpt_params {
    std::string cvector_outfile       = "control_vector.gguf";
    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
-
-    bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -384,8 +380,6 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
-// If the built-in template is not supported, we default to chatml
-// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
        const std::string & tmpl,
        const std::vector<llama_chat_msg> & chat,
@@ -460,3 +454,4 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
    FILE * stream, const gpt_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -720,7 +720,7 @@ private:
            }
            prop_names.push_back(prop_name);
        }
-        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
+        if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
            std::string value_rule =
                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-

 # This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
 #
 # This is necessary in order to analyze the type of pre-tokenizer used by the model and
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
@@ -15,9 +15,9 @@
 # - Add a new model to the "models" list
 # - Run the script with your huggingface token:
 #
-#   python3 convert_hf_to_gguf_update.py <huggingface_token>
+#   python3 convert-hf-to-gguf-update.py <huggingface_token>
 #
-# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py
+# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
 # - Update llama.cpp with the new pre-tokenizer if necessary
 #
 # TODO: generate tokenizer tests for llama.cpp
@@ -37,7 +37,7 @@ from enum import IntEnum, auto
 from transformers import AutoTokenizer

 logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("convert_hf_to_gguf_update")
+logger = logging.getLogger("convert-hf-to-gguf-update")
 sess = requests.Session()


@@ -45,7 +45,6 @@ class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
-    UGM = auto()


 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
@@ -56,10 +55,10 @@ if len(sys.argv) == 2:
    token = sys.argv[1]
    if not token.startswith("hf_"):
        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
+        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
        sys.exit(1)
 else:
-    logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
+    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)

 # TODO: add models here, base models preferred
@@ -86,11 +85,6 @@ models = [
    {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
    {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
    {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
-    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
-    {"name": "gemma",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
-    {"name": "gemma-2",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
-    {"name": "jais",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
-    {"name": "t5",             "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
 ]


@@ -112,13 +106,9 @@ def download_model(model):
    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
-
    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

-    if tokt == TOKENIZER_TYPE.UGM:
-        files.append("spiece.model")
-
    for file in files:
        save_path = f"models/tokenizers/{name}/{file}"
        if os.path.isfile(save_path):
@@ -134,14 +124,14 @@ for model in models:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


-# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
+# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:

 src_ifs = ""
 for model in models:
    name = model["name"]
    tokt = model["tokt"]

-    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
+    if tokt == TOKENIZER_TYPE.SPM:
        continue

    # Skip if the tokenizer folder does not exist or there are other download issues previously
@@ -151,10 +141,7 @@ for model in models:

    # create the tokenizer
    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
        continue  # Skip to the next model if the tokenizer can't be loaded
@@ -201,7 +188,7 @@ src_func = f"""

        res = None

-        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
 {src_ifs}
@@ -210,9 +197,9 @@ src_func = f"""
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("**          There are 2 possible reasons for this:")
-            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
+            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
-            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
+            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
@@ -226,7 +213,7 @@ src_func = f"""
        return res
 """

-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
 convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
@@ -237,7 +224,7 @@ convert_py = re.sub(

 convert_py_pth.write_text(convert_py, encoding="utf-8")

-logger.info("+++ convert_hf_to_gguf.py was updated")
+logger.info("+++ convert-hf-to-gguf.py was updated")

 # generate tests for each tokenizer model

@@ -275,7 +262,6 @@ tests = [
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
-    "!!!!!!",
    "3",
    "33",
    "333",
@@ -285,8 +271,7 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
-    "Cửa Việt", # llama-bpe fails on this
-    " discards",
+    # "Cửa Việt", # llama-bpe fails on this
    chktxt,
 ]

@@ -314,10 +299,7 @@ for model in models:

    # create the tokenizer
    try:
-        if name == "t5":
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop
@@ -343,6 +325,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin
 for model in models:
    name = model["name"]

-    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100

 logger.info("\n")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -13,7 +13,7 @@ import sys
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast

 import math
 import numpy as np
@@ -404,7 +404,7 @@ class Model:

        return tokens, toktypes, tokpre

-    # NOTE: this function is generated by convert_hf_to_gguf_update.py
+    # NOTE: this function is generated by convert-hf-to-gguf-update.py
    #       do not modify it manually!
    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
    # Marker: Start get_vocab_base_pre
@@ -424,7 +424,7 @@ class Model:

        res = None

-        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
@@ -487,21 +487,15 @@ class Model:
        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
            res = "jina-v2-code"
-        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
-            # ref: https://huggingface.co/LumiOpen/Viking-7B
-            res = "viking"
-        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
-            # ref: https://huggingface.co/core42/jais-13b
-            res = "jais"

        if res is None:
            logger.warning("\n")
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("**          There are 2 possible reasons for this:")
-            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
+            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
-            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
+            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {chkhsh}")
@@ -579,19 +573,7 @@ class Model:
        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self, add_to_gguf=True):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def _create_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

        tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -653,7 +635,14 @@ class Model:
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)

-        return tokens, scores, toktypes
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_llama_hf(self):
        vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -677,51 +666,6 @@ class Model:
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
-        tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf"
-        logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
-        vocab_reader = gguf.GGUFReader(tokenizer_path, "r")
-
-        default_pre = "mpt" if model_name == "gpt-neox" else "default"
-
-        field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL)
-        assert field  # tokenizer model
-        self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8"))
-
-        field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE)
-        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else default_pre)
-
-        field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST)
-        assert field  # token list
-        self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
-
-        if model_name == "llama-spm":
-            field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES)
-            assert field  # token scores
-            self.gguf_writer.add_token_scores([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
-
-        field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
-        assert field  # token types
-        self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
-
-        if model_name != "llama-spm":
-            field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES)
-            assert field  # token merges
-            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
-
-        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)) is not None:
-            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
-        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)) is not None:
-            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
-        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)) is not None:
-            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
-        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)) is not None:
-            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0])
-        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_BOS)) is not None:
-            self.gguf_writer.add_add_bos_token(field.parts[-1].tolist()[0])
-        if (field := vocab_reader.get_field(gguf.Keys.Tokenizer.ADD_EOS)) is not None:
-            self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
-

@Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
@@ -1987,7 +1931,7 @@ class Phi3MiniModel(Model):
        if len(rope_scaling_type) == 0:
            raise KeyError('Missing the required key rope_scaling.type')

-        if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
+        if rope_scaling_type == 'su':
            attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
        elif rope_scaling_type == 'yarn':
            attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -2361,8 +2305,6 @@ class GemmaModel(Model):
        special_vocab._set_special_token("eot",    107)
        special_vocab.add_to_gguf(self.gguf_writer)

-        self.gguf_writer.add_add_space_prefix(False)
-
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@@ -2395,71 +2337,6 @@ class GemmaModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("Gemma2ForCausalLM")
-class Gemma2Model(Model):
-    model_arch = gguf.MODEL_ARCH.GEMMA2
-
-    def set_vocab(self):
-        tokens, scores, toktypes = self._create_vocab_sentencepiece()
-        # hack: This is required so that we can properly use start/end-of-turn for chat template
-        for i in range(108):
-            # including <unusedX>, <start_of_turn>, <end_of_turn>
-            toktypes[i] = SentencePieceTokenTypes.CONTROL
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-        self.gguf_writer.add_add_space_prefix(False)
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_key_length(hparams["head_dim"])
-        self.gguf_writer.add_value_length(hparams["head_dim"])
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_attn_logit_softcapping(
-            self.hparams["attn_logit_softcapping"]
-        )
-        self.gguf_writer.add_final_logit_softcapping(
-            self.hparams["final_logit_softcapping"]
-        )
-        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-
-        # sanity check
-        attn_scalar = self.hparams["query_pre_attn_scalar"]
-        if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
-            raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unusem
-
-        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
-        # To prevent errors, skip loading lm_head.weight.
-        if name == "lm_head.weight":
-            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
-            return []
-
-        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
-        if name.endswith("norm.weight"):
-            data_torch = data_torch + 1
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-
@Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
    model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -2484,7 +2361,39 @@ class MambaModel(Model):
            self._set_vocab_sentencepiece()
        else:
            # Use the GPT-NeoX tokenizer when no tokenizer files are present
-            self._set_vocab_builtin("gpt-neox", vocab_size)
+            tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
+            logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+            neox_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2")
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
+            self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt")
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
+            assert field
+            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+            assert field
+            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+            assert field
+            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
+            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1)
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
+            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0)
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
+            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0)
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID)
+            self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0)

    def set_gguf_parameters(self):
        d_model = self.find_hparam(["hidden_size",       "d_model"])
@@ -2636,82 +2545,6 @@ class JinaBertV2Model(BertModel):
        self.gguf_writer.add_add_eos_token(True)


-@Model.register("OpenELMForCausalLM")
-class OpenELMModel(Model):
-    model_arch = gguf.MODEL_ARCH.OPENELM
-
-    @staticmethod
-    def _make_divisible(v: float | int, divisor: int) -> int:
-        # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38
-        new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
-        # Make sure that round down does not go down by more than 10%.
-        if new_v < 0.9 * v:
-            new_v += divisor
-        return new_v
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        ffn_multipliers: list[float] = self.hparams["ffn_multipliers"]
-        ffn_dim_divisor: int = self.hparams["ffn_dim_divisor"]
-        self._n_embd: int = self.hparams["model_dim"]
-        self._num_kv_heads: list[int] = self.hparams["num_kv_heads"]
-        self._num_query_heads: list[int] = self.hparams["num_query_heads"]
-        self._ffn_dims: list[int] = [
-            OpenELMModel._make_divisible(multiplier * self._n_embd, ffn_dim_divisor)
-            for multiplier in ffn_multipliers
-        ]
-        assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int)
-        assert isinstance(self._num_query_heads, list) and isinstance(self._num_query_heads[0], int)
-
-    # Uses the tokenizer from meta-llama/Llama-2-7b-hf
-    def set_vocab(self):
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_builtin("llama-spm", self.hparams["vocab_size"])
-
-    def set_gguf_parameters(self):
-        n_embd = self._n_embd
-        head_dim = self.hparams["head_dim"]
-        rot_pct = 1.0
-        assert self.block_count == len(self._num_kv_heads)
-        assert self.block_count == len(self._num_query_heads)
-        assert self.block_count == len(self._ffn_dims)
-
-        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_context_length(self.hparams["max_context_length"])
-        self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(self._ffn_dims)
-        self.gguf_writer.add_head_count(self._num_query_heads)
-        self.gguf_writer.add_head_count_kv(self._num_kv_heads)
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"])
-        # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30
-        self.gguf_writer.add_layer_norm_rms_eps(1e-6)
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim))
-        self.gguf_writer.add_key_length(head_dim)
-        self.gguf_writer.add_value_length(head_dim)
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
-        if "n_layers" in keys:
-            return self.hparams["num_transformer_layers"]
-
-        return super().find_hparam(keys, optional)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-
-        # split ff
-        if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight":
-            ff_dim = self._ffn_dims[bid]
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])
-            return
-
-        yield (self.map_tensor_name(name), data_torch)
-
-
@Model.register("ArcticForCausalLM")
 class ArcticModel(Model):
    model_arch = gguf.MODEL_ARCH.ARCTIC
@@ -2942,17 +2775,11 @@ class DeepseekV2Model(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


-@Model.register("T5WithLMHeadModel")
@Model.register("T5ForConditionalGeneration")
-@Model.register("MT5ForConditionalGeneration")
-@Model.register("UMT5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
 class T5Model(Model):
    model_arch = gguf.MODEL_ARCH.T5

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.shared_token_embeddings_found = False
-
    def set_vocab(self):
        # to avoid TypeError: Descriptors cannot be created directly
        # exception when importing sentencepiece_model_pb2
@@ -2960,29 +2787,17 @@ class T5Model(Model):
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model

-        tokenizer_path = self.dir_model / 'tokenizer.model'
-
-        # many older models use spiece.model tokenizer model filename
-        if not tokenizer_path.is_file():
-            tokenizer_path = self.dir_model / 'spiece.model'
+        tokenizer_path = self.dir_model / 'spiece.model'

        if not tokenizer_path.is_file():
            raise FileNotFoundError(f"File not found: {tokenizer_path}")

        sentencepiece_model = model.ModelProto()
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-
-        # some models like Pile-T5 family use BPE tokenizer instead of Unigram
-        if sentencepiece_model.trainer_spec.model_type == 2: # BPE
-            # assure the tokenizer model file name is correct
-            assert tokenizer_path.name == 'tokenizer.model'
-            return self._set_vocab_sentencepiece()
-        else:
-            assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
-
        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM

        tokenizer = SentencePieceProcessor()
        tokenizer.LoadFromFile(str(tokenizer_path))
@@ -3052,10 +2867,7 @@ class T5Model(Model):

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("T5")
-        if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
-            logger.warning("Couldn't find context length in config.json, assuming default value of 512")
-            n_ctx = 512
-        self.gguf_writer.add_context_length(n_ctx)
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
        self.gguf_writer.add_block_count(self.hparams["num_layers"])
@@ -3071,111 +2883,16 @@ class T5Model(Model):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

-        # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight",
-        # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored
-        # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder
-        # and decoder and ignore the remaining ones.
-        if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]:
-            if not self.shared_token_embeddings_found:
-                name = "shared.weight"
-                self.shared_token_embeddings_found = True
-            else:
-                logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.")
-                return []
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []

        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("JAISLMHeadModel")
-class JaisModel(Model):
-    model_arch = gguf.MODEL_ARCH.JAIS
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        # SwigLU activation
-        assert self.hparams["activation_function"] == "swiglu"
-        # ALiBi position embedding
-        assert self.hparams["position_embedding_type"] == "alibi"
-
-        # Embeddings scale
-        self.embeddings_scale = 1.0
-        # note: For some JAIS flavors, output is tied to (same as) wte in original model
-        self.output_is_wte = False
-        if 'mup_embeddings_scale' in self.hparams:
-            self.output_is_wte = True   # Hack (?)
-            self.embeddings_scale = self.hparams['mup_embeddings_scale']
-        elif 'embeddings_scale' in self.hparams:
-            self.embeddings_scale = self.hparams['embeddings_scale']
-        else:
-            assert False
-
-        self.width_scale = 1.0
-        if 'mup_output_alpha' in self.hparams:
-            assert 'mup_width_scale' in self.hparams
-            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
-        elif 'width_scale' in self.hparams:
-            self.width_scale = self.hparams['width_scale']
-        else:
-            assert False
-
-        self.max_alibi_bias = 8.0
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_context_length(self.hparams["n_positions"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        tensors: list[tuple[str, Tensor]] = []
-
-        # we don't need these
-        if name.endswith((".attn.bias")):
-            return tensors
-
-        if name.endswith(("relative_pe.slopes")):
-            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
-            # Some other models has max_alibi_bias spelled out explicitly in the hyperparams,
-            # but Jais's PyTorch model simply precalculates the slope values and places them
-            # in relative_pes.slopes
-            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
-            first_val = float(data_torch._data[0])
-            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
-
-            return tensors
-
-        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
-            data_torch = data_torch.transpose(1, 0)
-
-        new_name = self.map_tensor_name(name)
-
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((new_name, data_torch * self.embeddings_scale))
-            if self.output_is_wte:
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
-        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
-            assert not self.output_is_wte
-            tensors.append((new_name, data_torch * self.width_scale))
-        else:
-            tensors.append((new_name, data_torch))
-
-        return tensors
-
-    def write_tensors(self):
-        super().write_tensors()
-        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
-
-
 ###### CONVERSION LOGIC ######


@@ -3331,8 +3048,7 @@ def main() -> None:
        "auto": gguf.LlamaFileType.GUESSED,
    }

-    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
-    if args.use_temp_file and is_split:
+    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
        logger.error("Error: Cannot use temp file when splitting")
        sys.exit(1)

@@ -3369,12 +3085,11 @@ def main() -> None:
        if args.vocab_only:
            logger.info("Exporting model vocab...")
            model_instance.write_vocab()
-            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
+            logger.info("Model vocab successfully exported.")
        else:
            logger.info("Exporting model...")
            model_instance.write()
-            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
-            logger.info(f"Model successfully exported to {out_path}")
+            logger.info("Model successfully exported.")


 if __name__ == '__main__':
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
 Makefile:

 ```bash
-make GGML_BLIS=1 -j
-# make GGML_BLIS=1 llama-benchmark-matmult
+make LLAMA_BLIS=1 -j
+# make LLAMA_BLIS=1 benchmark-matmult
 ```

 CMake:
@@ -39,7 +39,7 @@ CMake:
 ```bash
 mkdir build
 cd build
-cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
+cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
 make -j
 ```

--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -39,13 +39,13 @@ else()
    add_subdirectory(quantize-stats)
    add_subdirectory(quantize)
    add_subdirectory(retrieval)
-    if (GGML_RPC)
+    if (LLAMA_RPC)
        add_subdirectory(rpc)
    endif()
    if (LLAMA_BUILD_SERVER)
    add_subdirectory(server)
    endif()
-    if (GGML_SYCL)
+    if (LLAMA_SYCL)
        add_subdirectory(sycl)
    endif()
    add_subdirectory(save-load-state)
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -93,34 +93,14 @@ int main(int argc, char ** argv) {

    // create a llama_batch
    // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
-
-    std::vector<llama_seq_id> seq_ids(n_parallel, 0);
-    for (int32_t i = 0; i < n_parallel; ++i) {
-        seq_ids[i] = i;
-    }
+    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
    }
    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

-    if (llama_model_has_encoder(model)) {
-        if (llama_encode(ctx, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
-            return 1;
-        }
-
-        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == -1) {
-            decoder_start_token_id = llama_token_bos(model);
-        }
-
-        llama_batch_clear(batch);
-        llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
-    }
-
    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

@@ -129,11 +109,11 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    //// assign the system KV cache to all parallel sequences
-    //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-    //for (int32_t i = 1; i < n_parallel; ++i) {
-    //    llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-    //}
+    // assign the system KV cache to all parallel sequences
+    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
+    for (int32_t i = 1; i < n_parallel; ++i) {
+        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+    }

    if (n_parallel > 1) {
        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -58,3 +58,4 @@ The above command will output space-separated float values.
 ```powershell
 embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2  --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
+
--- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py
+++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example

 ```bash
-GGML_CUDA=1 make -j
+LLAMA_CUDA=1 make -j

 # generate importance matrix (imatrix.dat)
 ./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -15,7 +15,6 @@ In this section, we cover the most commonly used options for running the `infill
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 -   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
-   `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

 ## Input Prompts

--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -210,7 +210,6 @@ int main(int argc, char ** argv) {
        suff_rm_leading_spc = false;
    }
    std::vector<llama_token> embd_inp;
-    std::vector<llama_token> embd_end;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871;
@@ -218,13 +217,12 @@ int main(int argc, char ** argv) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-    embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-    embd_end = params.spm_infill ? inp_pfx : inp_sfx;
    if (add_bos) {
-        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
    }
-    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+    embd_inp = inp_pfx;
+    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());

    const llama_token middle_token = llama_token_middle(model);
    if (middle_token >= 0) {
@@ -528,14 +526,14 @@ int main(int argc, char ** argv) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-                embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
-                embd_end = params.spm_infill ? inp_pfx : inp_sfx;
                if (add_bos) {
-                    embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
                }
-                embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+                embd_inp = inp_pfx;
+                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());

+                const llama_token middle_token = llama_token_middle(model);
                if (middle_token >= 0) {
                    embd_inp.push_back(middle_token);
                }
@@ -659,3 +657,4 @@ int main(int argc, char ** argv) {

    return 0;
 }
+
--- a/examples/json-schema-pydantic-example.py
+++ b/examples/json-schema-pydantic-example.py
@@ -1,7 +1,7 @@
 # Usage:
 #! ./llama-server -m some-model.gguf &
 #! pip install pydantic
-#! python json_schema_pydantic_example.py
+#! python json-schema-pydantic-example.py

 from pydantic import BaseModel, Extra, TypeAdapter
 from annotated_types import MinLen
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}

 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')


 class SchemaConverter:
@@ -602,7 +602,7 @@ class SchemaConverter:
                else:
                    add_component(t, is_required=True)

-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))

        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
            items = schema.get('items') or schema['prefixItems']
@@ -691,7 +691,7 @@ class SchemaConverter:
        required_props = [k for k in sorted_props if k in required]
        optional_props = [k for k in sorted_props if k not in required]

-        if additional_properties is not None and additional_properties != False:
+        if additional_properties != False:
            sub_name = f'{name}{"-" if name else ""}additional'
            value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
                self._add_primitive('value', PRIMITIVE_RULES['value'])
--- a/examples/llama.android/llama/CMakeLists.txt
+++ b/examples/llama.android/llama/CMakeLists.txt
@@ -0,0 +1,55 @@
+
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+## Fetch latest llama.cpp from GitHub
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
+#
+## Also provides "common"
+#FetchContent_MakeAvailable(llama)
+
+# llama.cpp CI uses the code from the current branch
+# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
+add_subdirectory(../../../../../../ build-llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+        llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")

-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG        master
+)

 # Also provides "common"
-#FetchContent_MakeAvailable(llama)
+FetchContent_MakeAvailable(llama)

 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,10 +30,6 @@ project("llama-android")
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
-
-#load local llama.cpp
-add_subdirectory(../../../../../../ build-llama)
-
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -5,7 +5,7 @@
 #include <string>
 #include <unistd.h>
 #include "llama.h"
-#include "common.h"
+#include "common/common.h"

 // Write C++ code here.
 //
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```

-2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
+python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert-image-encoder-to-gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,17 +47,17 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 ```

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert-image-encoder-to-gguf \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
    --projector-type ldpv2
 ```

-4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
+python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
@@ -194,7 +194,7 @@ llama_print_timings:       total time =   44411.01 ms /   377 tokens
 ## Orin compile and run
 ### compile
 ```sh
-make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
 ### run on Orin
 ### case 1
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 pip install -r examples/llava/requirements.txt
 ```

-3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
-python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
+python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 ```

-4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
+python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 pip install -r examples/llava/requirements.txt
 ```

-3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
-python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
+python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory

@@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso

 5) Create the visual gguf model:
 ```console
-python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
+python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP

 6) Then convert the model to gguf format:
 ```console
-python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava cli using the 1.6 model version:
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            if (n < 32)
                hparams.image_grid_pinpoints[n] = 0;
-        } catch (std::runtime_error & /*e*/) {
+        } catch (std::runtime_error & e) {
            hparams.image_grid_pinpoints[0]=0;
        }

        try {
            int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
            strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx));
-        } catch (std::runtime_error & /*e*/) {
+        } catch (std::runtime_error & e) {
            strcpy(hparams.mm_patch_merge_type, "flat");
        }

        try {
            hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
-        } catch(const std::exception& /*e*/) {
+        } catch(const std::exception& e) {
            hparams.image_crop_resolution = hparams.image_size;
        }

@@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        try {
            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            new_clip->has_class_embedding = true;
-        } catch (const std::exception& /*e*/) {
+        } catch (const std::exception& e) {
            new_clip->has_class_embedding = false;
        }

@@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
            new_clip->has_pre_norm = true;
-        } catch (std::exception & /*e*/) {
+        } catch (std::exception & e) {
            new_clip->has_pre_norm = false;
        }

@@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
            new_clip->has_post_norm = true;
-        } catch (std::exception & /*e*/) {
+        } catch (std::exception & e) {
            new_clip->has_post_norm = false;
        }

        try {
            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
            new_clip->has_patch_bias = true;
-        } catch (std::exception & /*e*/) {
+        } catch (std::exception & e) {
            new_clip->has_patch_bias = false;
        }

        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        } catch(const std::exception& /*e*/) {
+        } catch(const std::exception& e) {
            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

@@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                // Yi-type llava
                vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
                vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
+            } catch (std::runtime_error & e) {  }
            try {
                // missing in Yi-type llava
                vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
+            } catch (std::runtime_error & e) {  }
            try {
                // Yi-type llava
                vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
+            } catch (std::runtime_error & e) {  }
            try {
                // Yi-type llava
                vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
-            } catch (std::runtime_error & /*e*/) { }
+            } catch (std::runtime_error & e) {  }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
-            } catch (std::runtime_error & /*e*/) { }
+            } catch (std::runtime_error & e) {  }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
            vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
--- a/examples/llava/llava-surgery-v2.py
+++ b/examples/llava/llava-surgery-v2.py
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -1,3 +1,3 @@
-r ../../requirements/requirements-convert_legacy_llama.txt
+-r ../../requirements/requirements-convert-legacy-llama.txt
 pillow~=10.2.0
-torch~=2.2.1
+torch~=2.1.1
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@@ -10,3 +10,4 @@ More info:

 https://github.com/ggerganov/llama.cpp/pull/4484
 https://github.com/ggerganov/llama.cpp/issues/4226
+
--- a/examples/main-cmake-pkg/.gitignore
+++ b/examples/main-cmake-pkg/.gitignore
@@ -48,3 +48,4 @@
 build*/
 out/
 tmp/
+
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@@ -30,3 +30,4 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -37,8 +37,7 @@ static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
-static bool is_interacting  = false;
-static bool need_insert_eot = false;
+static bool is_interacting = false;

 static bool file_exists(const std::string & path) {
    std::ifstream f(path.c_str());
@@ -100,8 +99,7 @@ static void write_logfile(
 static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting && g_params->interactive) {
-            is_interacting  = true;
-            need_insert_eot = true;
+            is_interacting = true;
        } else {
            console::cleanup();
            printf("\n");
@@ -226,14 +224,7 @@ int main(int argc, char ** argv) {
                __func__, n_ctx_train, n_ctx);
    }

-    // print chat template example in conversation mode
-    if (params.conversation) {
-        if (params.enable_chat_template) {
-            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
-        } else {
-            LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
-        }
-    }
+    LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());

    // print system information
    {
@@ -264,15 +255,13 @@ int main(int argc, char ** argv) {
    }

    const bool add_bos = llama_should_add_bos_token(model);
-    if (!llama_model_has_encoder(model)) {
-        GGML_ASSERT(llama_add_eos_token(model) != 1);
-    }
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;

    {
-        auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
+        auto prompt = params.conversation
            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -528,24 +517,6 @@ int main(int argc, char ** argv) {
        exit(1);
    }

-    if (llama_model_has_encoder(model)) {
-        int enc_input_size = embd_inp.size();
-        llama_token * enc_input_buf = embd_inp.data();
-
-        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
-            LOG_TEE("%s : failed to eval\n", __func__);
-            return 1;
-        }
-
-        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == -1) {
-            decoder_start_token_id = llama_token_bos(model);
-        }
-
-        embd_inp.clear();
-        embd_inp.push_back(decoder_start_token_id);
-    }
-
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {
@@ -839,9 +810,7 @@ int main(int argc, char ** argv) {
                        is_antiprompt = true;
                    }

-                    if (params.enable_chat_template) {
-                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
-                    }
+                    chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
                    is_interacting = true;
                    printf("\n");
                }
@@ -903,24 +872,16 @@ int main(int argc, char ** argv) {
                        string_process_escapes(buffer);
                    }

-                    bool format_chat = params.conversation && params.enable_chat_template;
-                    std::string user_inp = format_chat
+                    std::string user_inp = params.conversation
                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                        : std::move(buffer);
                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
+                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, params.conversation);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

-                    // if user stop generation mid-way, we must add EOT to finish model's last response
-                    if (need_insert_eot && format_chat) {
-                        llama_token eot = llama_token_eot(model);
-                        embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot);
-                        need_insert_eot = false;
-                    }
-
                    embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                    embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -1,8 +1,5 @@
 # llama.cpp/example/passkey

-A passkey retrieval task is an evaluation method used to measure a language
-models ability to recall information from long contexts.
-
 See the following PRs for more info:

 - https://github.com/ggerganov/llama.cpp/pull/3856
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1991,12 +1991,6 @@ int main(int argc, char ** argv) {
        params.n_batch = std::min(params.n_batch, n_kv);
    } else {
        params.n_batch = std::min(params.n_batch, params.n_ctx);
-        if (params.kl_divergence) {
-            params.n_parallel = 1;
-        } else {
-            // ensure there's at least enough seq_ids for HellaSwag
-            params.n_parallel = std::max(4, params.n_parallel);
-        }
    }

    if (params.ppl_stride > 0) {
@@ -2021,6 +2015,9 @@ int main(int argc, char ** argv) {
    llama_model * model;
    llama_context * ctx;

+    // ensure there's at least enough seq_ids for HellaSwag
+    params.n_parallel = std::max(4, params.n_parallel);
+
    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d

 ## Usage

-On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
+On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
 For example, to build the CUDA backend with RPC support:

 ```bash
 mkdir build-rpc-cuda
 cd build-rpc-cuda
-cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
+cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
 cmake --build . --config Release
 ```

@@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.


-On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
+On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:

 ```bash
 mkdir build-rpc
 cd build-rpc
-cmake .. -DGGML_RPC=ON
+cmake .. -DLLAMA_RPC=ON
 cmake --build . --config Release
 ```

--- a/examples/server-embd.py
+++ b/examples/server-embd.py
@@ -31,3 +31,4 @@ for i in range(n-1):
        embedding2 = np.array(result[j])
        similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
        print(f"Similarity between {i} and {j}: {similarity:.2f}")
+
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,14 +1,7 @@
 set(TARGET llama-server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-option(LLAMA_SERVER_SSL     "Build SSL support for the server"        OFF)
-
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
-
-if (MINGW)
-    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 set(TARGET_SRCS
    server.cpp
    utils.hpp
@@ -31,7 +24,6 @@ set(PUBLIC_ASSETS
    prompt-formats.js
    json-schema-to-grammar.mjs
 )
-
 foreach(asset ${PUBLIC_ASSETS})
    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
@@ -42,23 +34,18 @@ foreach(asset ${PUBLIC_ASSETS})
        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
    )
 endforeach()
-
 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-
 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
-
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
    target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
 endif()
-
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
-
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -73,7 +73,6 @@ The project is under active development, and we are [looking for feedback and co
 - `-fa`, `--flash-attn` : enable flash attention (default: disabled).
 - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
 - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -375,7 +374,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)

- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used.

    *Options:*

--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };

 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');

 export class SchemaConverter {
  constructor(options) {
@@ -751,7 +751,7 @@ export class SchemaConverter {
    const requiredProps = sortedProps.filter(k => required.has(k));
    const optionalProps = sortedProps.filter(k => !required.has(k));

-    if (additionalProperties) {
+    if (additionalProperties !== false) {
      const subName = `${name ?? ''}${name ? '-' : ''}additional`;
      const valueRule =
        additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2020,7 +2020,6 @@ struct server_context {
                        slot.t_start_generation = 0;

                        if (slot.infill) {
-                            const bool add_bos = llama_should_add_bos_token(model);
                            bool suff_rm_leading_spc = true;
                            if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                                params.input_suffix.erase(0, 1);
@@ -2036,21 +2035,16 @@ struct server_context {
                            }

                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
-
-                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
-                            if (add_bos) {
-                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                            }
-                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+                            prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
+                            prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());

                            const llama_token middle_token = llama_token_middle(model);
                            if (middle_token >= 0) {
-                                embd_inp.push_back(middle_token);
+                                prefix_tokens.push_back(middle_token);
                            }

-                            prompt_tokens = embd_inp;
+                            prompt_tokens = prefix_tokens;
                        } else {
                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                        }
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -52,3 +52,4 @@ Feature: Passkey / Self-extend with context shift
      #| TheBloke/Llama-2-7B-GGUF        | llama-2-7b.Q2_K.gguf        | 4096        | 3   | 16384 | 512     | 4    | 512    | 500    | 300   | 1234    | 5           | 1234           |
      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768       | 2   | 16384 | 512     | 4    | 512    | 500    | 100   | 0987    | 5           | 0
      # 987           |
+
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
@@ -1054,3 +1054,4 @@
 </body>

 </html>
+
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
@@ -1058,3 +1058,4 @@
 </body>

 </html>
+
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference

 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 #build example/main
 #cmake --build . --config Release --target main
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -34,3 +34,4 @@ fi

 #use multiple GPUs with same max compute units
 #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
-::  cmake -G "MinGW Makefiles" ..  -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

 ::  for FP32
-cmake -G "Ninja" ..  -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main
@@ -31,3 +31,4 @@ exit /B 0
 :ERROR
 echo comomand error: %errorlevel%
 exit /B %errorlevel%
+
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -7,3 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"


 .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+
+
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -30,7 +30,6 @@ static void print_usage_information(const char * argv0, FILE * stream) {
    fprintf(stream, "    --stdin                              read prompt from standard input.\n");
    fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
    fprintf(stream, "    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
-    fprintf(stream, "    --show-count                         print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -196,7 +195,6 @@ int main(int raw_argc, char ** raw_argv) {
    bool printing_ids = false;
    bool no_bos = false;
    bool disable_logging = false;
-    bool show_token_count = false;
    const char * model_path = NULL;
    const char * prompt_path = NULL;
    const char * prompt_arg = NULL;
@@ -251,9 +249,6 @@ int main(int raw_argc, char ** raw_argv) {
        else if (arg == "--log-disable") {
            disable_logging = true;
        }
-        else if (arg == "--show-count") {
-            show_token_count = true;
-        }
        else {
            fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
            return 1;
@@ -389,9 +384,6 @@ int main(int raw_argc, char ** raw_argv) {
        printf("]\n");
    }

-    if (show_token_count) {
-        printf("Total number of tokens: %ld\n", tokens.size());
-    }
    // silence valgrind
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1719506693,
-        "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
+        "lastModified": 1718318537,
+        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
+        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
        "type": "github"
      },
      "original": {
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -988,19 +988,19 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
        }

-        if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
+        //if (this_size > max_size) {
+        //    fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+        //            __func__, t->name,
+        //            ggml_backend_buft_name(buft),
+        //            this_size, max_size);
+        //    for (size_t i = 0; i < n_buffers; i++) {
+        //        ggml_backend_buffer_free(buffers[i]);
+        //    }
+        //    free(buffers);
+        //    return NULL;
+        //}

-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size != 0 && (cur_buf_size + this_size) > max_size) {
            // allocate tensors in the current buffer
            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                return NULL;
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
--- a/ggml/include/ggml-blas.h
+++ b/ggml/include/ggml-blas.h
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2

 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 4
+#define QR2_XXS 8

 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 4
+#define QR2_XS 8

 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 4
+#define QR2_S 8

 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 4
+#define QR3_XXS 8

 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 4
+#define QR3_XS 8

 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2

 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 2
+#define QR4_XS 8

 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S 4
+#define QR3_S 8

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -576,6 +576,11 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
    GGML_UNUSED(buft);
 }

+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+    return ggml_cuda_info().devices[buft_ctx->device].vmm_granularity;
+}
+
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    size_t size = ggml_nbytes(tensor);
    int64_t ne0 = tensor->ne[0];
@@ -595,7 +600,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_max_size     = */ ggml_backend_cuda_buffer_type_get_max_size,
    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
    /* .is_host          = */ NULL,
 };
@@ -1882,11 +1887,6 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    bool              use_mul_mat_q =  ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
    bool any_gpus_with_slow_fp16 = false;

    if (split) {
@@ -1899,15 +1899,22 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
            }

            const int cc            = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_vec_q       = use_mul_mat_vec_q       && cc >= MIN_CC_DP4A;
            use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
        }
    } else {
        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_vec_q       = use_mul_mat_vec_q       && cc >= MIN_CC_DP4A;
        use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
    }

+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
+
    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -2711,40 +2718,27 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
-                struct ggml_tensor * a = op->src[0];
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
                if (op->op == GGML_OP_MUL_MAT) {
-                    struct ggml_tensor * b = op->src[1];
-                    if (a->ne[3] != b->ne[3]) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                ggml_type a_type = a->type;
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
+                    a_type == GGML_TYPE_IQ1_M   || a_type == GGML_TYPE_IQ2_S  || a_type == GGML_TYPE_IQ4_XS) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                        return false;
                    }
                }
-                switch (a->type) {
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                    case GGML_TYPE_Q2_K:
-                    case GGML_TYPE_Q3_K:
-                    case GGML_TYPE_Q4_K:
-                    case GGML_TYPE_Q5_K:
-                    case GGML_TYPE_Q6_K:
-                    case GGML_TYPE_Q8_K:
-                    case GGML_TYPE_IQ1_M:
-                    case GGML_TYPE_IQ1_S:
-                    case GGML_TYPE_IQ2_S:
-                    case GGML_TYPE_IQ2_XS:
-                    case GGML_TYPE_IQ2_XXS:
-                    case GGML_TYPE_IQ3_S:
-                    case GGML_TYPE_IQ3_XXS:
-                    case GGML_TYPE_IQ4_NL:
-                    case GGML_TYPE_IQ4_XS:
-                        return true;
-                    default:
-                        return false;
-                }
+                return true;
            } break;
        case GGML_OP_GET_ROWS:
            {
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
--- a/ggml/src/ggml-cuda/acc.cu
+++ b/ggml/src/ggml-cuda/acc.cu
--- a/ggml/src/ggml-cuda/acc.cuh
+++ b/ggml/src/ggml-cuda/acc.cuh
--- a/ggml/src/ggml-cuda/arange.cu
+++ b/ggml/src/ggml-cuda/arange.cu
--- a/ggml/src/ggml-cuda/arange.cuh
+++ b/ggml/src/ggml-cuda/arange.cuh
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
--- a/Show More
+++ b/Show More
				`@@ -7,3 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"`


				`.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0`