Compare commits

..

1 commit

Author:  Francis Couture-Harpin
SHA1:    ba06b2deb7
Date:    2024-07-10 18:06:25 -04:00
Message: tokenize : add --no-parse-special option
         This should allow more easily explaining how parse_special affects tokenization.
314 changed files with 162144 additions and 35493 deletions
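The one commit in this range adds a --no-parse-special flag to the tokenize example. With parse_special enabled, special tokens spelled out in the input text (e.g. <s>) are matched and mapped to their token IDs; with it disabled they are tokenized as ordinary text. A minimal C++ sketch of the difference, assuming the llama.h C API of this period; the model path and prompt are placeholders:

    #include "llama.h"
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        llama_backend_init();
        llama_model * model = llama_load_model_from_file("model.gguf", llama_model_default_params()); // placeholder path
        if (model == nullptr) { return 1; }

        const std::string text = "<s> Hello world"; // contains a special token spelled out
        for (bool parse_special : { true, false }) {
            std::vector<llama_token> tokens(text.size() + 16);
            // add_special=false so that only parse_special changes the result
            const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                         tokens.data(), (int) tokens.size(),
                                         /*add_special=*/false, parse_special);
            printf("parse_special=%d -> %d tokens\n", parse_special ? 1 : 0, n);
        }

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }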


@@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all


@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878


@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1


@@ -1,44 +0,0 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM cosdt/cann:$ASCEND_VERSION AS build
WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
# find libascend_hal.so, because the driver hasn't been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT
FROM cosdt/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
ENTRYPOINT ["/llama-cli" ]


@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
@@ -25,7 +25,7 @@ ENV GGML_CUDA=1
RUN make -j$(nproc) llama-cli
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libgomp1


@@ -1,6 +1,6 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@@ -14,12 +14,10 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with static libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli


@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878


@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1


@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git
@@ -11,7 +11,7 @@ COPY . .
RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION AS runtime
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libgomp1


@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
FROM ${BASE_CUDA_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
@@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl


@@ -1,6 +1,6 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@@ -14,11 +14,10 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl


@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878


@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget


@@ -1,9 +1,9 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git libcurl4-openssl-dev curl
WORKDIR /app
@@ -13,10 +13,10 @@ ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION AS runtime
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/llama-server /llama-server


@@ -10,6 +10,7 @@
"llama-embedding"
"llama-server"
"llama-quantize"
"llama-train-text-from-scratch"
];
mkApp = name: {
type = "app";


@@ -18,7 +18,6 @@
vulkan-headers,
vulkan-loader,
curl,
shaderc,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
@@ -126,9 +125,16 @@ let
++ optionals useMetalKit [ MetalKit ];
cudaBuildInputs = with cudaPackages; [
cuda_cudart
cuda_cccl # <nv/target>
libcublas
cuda_cccl.dev # <nv/target>
# A temporary hack for reducing the closure size, remove once cudaPackages
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
cuda_cudart.dev
cuda_cudart.lib
cuda_cudart.static
libcublas.dev
libcublas.lib
libcublas.static
];
rocmBuildInputs = with rocmPackages; [
@@ -140,7 +146,6 @@ let
vulkanBuildInputs = [
vulkan-headers
vulkan-loader
shaderc
];
in


@@ -8,11 +8,13 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert_hf_to_gguf.py "$@"
python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./llama-finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -34,6 +36,8 @@ else
echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
echo " See documentation for finetune for command-line parameters"
echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server"


@@ -1,6 +1,3 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark
@@ -132,8 +129,6 @@ jobs:
- name: Server bench
id: server_bench
env:
HEAD_REF: ${{ github.head_ref || github.ref_name }}
run: |
set -eux
@@ -142,7 +137,7 @@ jobs:
python bench.py \
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch $HEAD_REF \
--branch ${{ github.head_ref || github.ref_name }} \
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
--duration ${{ github.event.inputs.duration || env.DURATION }} \


@@ -47,7 +47,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -105,7 +105,7 @@ jobs:
sysctl -a
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -222,7 +222,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -355,10 +355,8 @@ jobs:
- name: Dependencies
id: depends
run: |
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential vulkan-sdk
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev
- name: Build
id: cmake_build
@@ -696,20 +694,22 @@ jobs:
strategy:
matrix:
include:
- build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2-x64'
- build: 'rpc-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'openblas-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan-x64'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
@@ -858,7 +858,6 @@ jobs:
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Determine tag name


@@ -6,13 +6,15 @@ on:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- '**/requirements*.txt'
- 'requirements.txt'
- 'requirements/*.txt'
pull_request:
paths:
- '.github/workflows/python-check-requirements.yml'
- 'scripts/check-requirements.sh'
- 'convert*.py'
- '**/requirements*.txt'
- 'requirements.txt'
- 'requirements/*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

.gitignore

@@ -50,7 +50,6 @@ build*
!docs/build.md
/libllama.so
/llama-*
/vulkan-shaders-gen
android-ndk-*
arm_neon.h
cmake-build-*
@@ -62,11 +61,6 @@ llama-batched-swift
out/
tmp/
# Deprecated
/main
/server
# CI
!.github/workflows/*.yml
@@ -79,6 +73,7 @@ models-mnt
!models/ggml-vocab-*.gguf*
# Zig
zig-out/
zig-cache/
@@ -129,6 +124,3 @@ poetry.toml
# Scripts
!/scripts/install-oneapi.bat
# Test models for lora adapters
/lora-tests


@@ -106,7 +106,6 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
#
# build the library
@@ -133,17 +132,7 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
# At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER)


@@ -28,7 +28,6 @@
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
@@ -61,8 +60,6 @@
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
]
}


@@ -1,18 +1,13 @@
# Pull requests (for contributors)
# Pull requests
- Always squash-merge the PR before merging
- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- Consider allowing write access to your branch for faster review
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
# Pull requests (for collaborators)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
# Coding guidelines

Makefile

@@ -11,6 +11,7 @@ BUILD_TARGETS = \
llama-embedding \
llama-eval-callback \
llama-export-lora \
llama-finetune \
llama-gbnf-validator \
llama-gguf \
llama-gguf-hash \
@@ -19,7 +20,6 @@ BUILD_TARGETS = \
llama-imatrix \
llama-infill \
llama-llava-cli \
llama-minicpmv-cli\
llama-lookahead \
llama-lookup \
llama-lookup-create \
@@ -37,6 +37,7 @@ BUILD_TARGETS = \
llama-simple \
llama-speculative \
llama-tokenize \
llama-train-text-from-scratch \
llama-vdot \
llama-cvector-generator \
tests/test-c.o
@@ -63,13 +64,13 @@ TEST_TARGETS = \
tests/test-tokenizer-1-spm
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
# Legacy build targets that were renamed in #7809, but we want to build binaries for them that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
# Deprecation aliases
ifdef LLAMA_CUBLAS
@@ -196,10 +197,6 @@ ifdef GGML_RPC
BUILD_TARGETS += rpc-server
endif
ifdef GGML_VULKAN
BUILD_TARGETS += vulkan-shaders-gen
endif
default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
test: $(TEST_TARGETS)
@@ -326,9 +323,9 @@ ifdef LLAMA_DEBUG
endif
else
MK_CPPFLAGS += -DNDEBUG
MK_CFLAGS += -O3 -g
MK_CXXFLAGS += -O3 -g
MK_NVCCFLAGS += -O3 -g
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
MK_NVCCFLAGS += -O3
endif
ifdef LLAMA_SANITIZE_THREAD
@@ -529,21 +526,10 @@ ifndef GGML_NO_ACCELERATE
endif
endif # GGML_NO_ACCELERATE
ifdef GGML_MUSA
CC := clang
CXX := clang++
GGML_CUDA := 1
MK_CPPFLAGS += -DGGML_USE_MUSA
endif
ifndef GGML_NO_OPENMP
MK_CPPFLAGS += -DGGML_USE_OPENMP
MK_CFLAGS += -fopenmp
MK_CXXFLAGS += -fopenmp
ifdef GGML_MUSA
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
endif # GGML_MUSA
endif # GGML_NO_OPENMP
ifdef GGML_OPENBLAS
@@ -561,17 +547,11 @@ ifdef GGML_OPENBLAS64
endif # GGML_OPENBLAS64
ifdef GGML_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_BLIS
ifdef GGML_NVPL
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_NVPL
ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o
@@ -594,27 +574,15 @@ else
endif # GGML_CUDA_FA_ALL_QUANTS
ifdef GGML_CUDA
ifdef GGML_MUSA
ifneq ('', '$(wildcard /opt/musa)')
CUDA_PATH ?= /opt/musa
else
CUDA_PATH ?= /usr/local/musa
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
endif # GGML_MUSA
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
OBJ_GGML += ggml/src/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -624,11 +592,9 @@ ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
endif # LLAMA_FATAL_WARNINGS
ifndef GGML_MUSA
ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT
endif # GGML_MUSA
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
@@ -641,12 +607,8 @@ endif # GGML_CUDA_DEBUG
ifdef GGML_CUDA_NVCC
NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
else
ifdef GGML_MUSA
NVCC = $(CCACHE) mcc
else
NVCC = $(CCACHE) nvcc
endif # GGML_MUSA
endif # GGML_CUDA_NVCC
NVCC = $(CCACHE) nvcc
endif #GGML_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH
MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@@ -717,15 +679,9 @@ define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
else
ifdef GGML_MUSA
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
endif # GGML_MUSA
endif # JETSON_EOL_MODULE_DETECT
ggml/src/ggml-cuda/%.o: \
@@ -748,8 +704,8 @@ endif # GGML_CUDA
ifdef GGML_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
MK_LDFLAGS += -lvulkan
OBJ_GGML += ggml/src/ggml-vulkan.o
ifdef GGML_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
@@ -763,10 +719,6 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
endif
ifdef GGML_VULKAN_PERF
MK_CPPFLAGS += -DGGML_VULKAN_PERF
endif
ifdef GGML_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif
@@ -775,28 +727,10 @@ ifdef GGML_VULKAN_RUN_TESTS
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
endif
GLSLC_CMD = glslc
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
_ggml_vk_input_dir = ggml/src/vulkan-shaders
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
$(_ggml_vk_header): $(_ggml_vk_source)
$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
$(_ggml_vk_genshaders_cmd) \
--glslc $(GLSLC_CMD) \
--input-dir $(_ggml_vk_input_dir) \
--target-hpp $(_ggml_vk_header) \
--target-cpp $(_ggml_vk_source)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
ggml/src/ggml-vulkan.o: \
ggml/src/ggml-vulkan.cpp \
ggml/include/ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_VULKAN
ifdef GGML_HIPBLAS
@@ -833,14 +767,6 @@ ifdef GGML_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
ifdef GGML_CUDA_FORCE_CUBLAS
HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY
@@ -893,16 +819,15 @@ ggml/src/ggml-metal-embed.o: \
ggml/src/ggml-common.h
@echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY}
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}
endif
endif # GGML_METAL
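Both the removed and the added recipes above implement the same embedding trick: a generated assembly stub brackets the Metal source with two global symbols and pulls the file bytes in with .incbin. On the C/C++ side the embedded data is then reachable as an ordinary symbol pair; a sketch of how such data is typically consumed (the consuming function is illustrative, not the actual ggml-metal code):

    #include <cstddef>
    #include <string>

    // Defined by the generated assembly stub via .globl / .incbin.
    // On Darwin, the assembler names _ggml_metallib_start/_ggml_metallib_end
    // map to these C identifiers.
    extern "C" const char ggml_metallib_start[];
    extern "C" const char ggml_metallib_end[];

    // Recover the embedded Metal source as a string at runtime.
    static std::string embedded_metallib() {
        return std::string(ggml_metallib_start,
                           (size_t) (ggml_metallib_end - ggml_metallib_start));
    }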
@@ -915,9 +840,6 @@ OBJ_GGML += \
OBJ_LLAMA = \
src/llama.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
src/unicode.o \
src/unicode-data.o
@@ -985,7 +907,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef GGML_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifndef GGML_MUSA
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
ifndef CUDA_DOCKER_ARCH
@@ -995,7 +916,6 @@ endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # GGML_MUSA
endif # GGML_CUDA
$(info )
@@ -1099,10 +1019,6 @@ src/unicode-data.o: \
src/llama.o: \
src/llama.cpp \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-grammar.h \
src/llama-sampling.h \
src/unicode.h \
include/llama.h \
ggml/include/ggml-cuda.h \
@@ -1112,29 +1028,6 @@ src/llama.o: \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-grammar.o: \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-sampling.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-sampling.o: \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_LLAMA): \
$(OBJ_LLAMA) \
$(LIB_GGML)
@@ -1211,14 +1104,12 @@ clean:
rm -rvf ggml/*.dll
rm -rvf ggml/*.so
rm -vrf ggml/src/*.o
rm -rvf ggml/src/llamafile/*.o
rm -rvf common/build-info.cpp
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
rm -rvf $(LEGACY_TARGETS_CLEAN)
find examples pocs -type f -name "*.o" -delete
@@ -1338,6 +1229,11 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1353,11 +1249,16 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \
llama-finetune: examples/finetune/finetune.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_GGML) common/log.h
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-retrieval: examples/retrieval/retrieval.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1458,20 +1359,15 @@ libllava.a: examples/llava/llava.cpp \
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
llama-llava-cli: examples/llava/llava-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
examples/llava/llava.h \
examples/llava/llava.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
@@ -1506,7 +1402,7 @@ run-benchmark-matmult: llama-benchmark-matmult
.PHONY: run-benchmark-matmult swift
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_ALL)
$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1615,45 +1511,56 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server
# Define the object file target
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
.PHONY: main quantize perplexity embedding server finetune
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these targets from building all the time.
main: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
server: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
quantize: examples/deprecation-warning/deprecation-warning.o
quantize: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif
perplexity: examples/deprecation-warning/deprecation-warning.o
perplexity: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif
embedding: examples/deprecation-warning/deprecation-warning.o
embedding: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
@echo "#########"
endif
finetune: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard finetune))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
@echo " Remove the 'finetune' binary to remove this warning."
@echo "#########"
endif
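Every legacy target above compiles the same deprecation-warning source and differs only in the output name, so one translation unit can serve main, server, quantize, perplexity, embedding, and finetune alike. A sketch of what such a shim can look like (illustrative; the repository's examples/deprecation-warning/deprecation-warning.cpp may differ in wording):

    #include <cstdio>
    #include <string>

    int main(int /*argc*/, char ** argv) {
        // Derive the legacy name from how the binary was invoked.
        std::string name = argv[0];
        const size_t slash = name.find_last_of("/\\");
        if (slash != std::string::npos) {
            name = name.substr(slash + 1);
        }
        // Point the user at the renamed llama-* binary and fail on purpose.
        fprintf(stderr, "WARNING: the '%s' binary is deprecated, please use 'llama-%s' instead\n",
                name.c_str(), name.c_str());
        return 1;
    }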


@@ -4,9 +4,6 @@ import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",


@@ -3,7 +3,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -95,18 +95,8 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
- [x] [Smaug](https://huggingface.co/models?search=Smaug)
- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
@@ -148,14 +138,12 @@ Typically finetunes of the base models below are supported as well.
Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -188,15 +176,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
**Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
@@ -422,11 +405,9 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
| [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
## Tools


@@ -13,9 +13,6 @@
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@@ -43,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -55,10 +52,6 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
## helpers
# download a file if it does not exist or if it is outdated
@@ -114,7 +107,7 @@ function gg_run_ctest_debug {
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -145,7 +138,7 @@ function gg_run_ctest_release {
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -273,6 +266,7 @@ function gg_sum_ctest_with_model_release {
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 {
cd ${SRC}
@@ -296,8 +290,8 @@ function gg_run_open_llama_7b_v2 {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -431,7 +425,7 @@ function gg_run_pythia_1_4b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -541,6 +535,7 @@ function gg_sum_pythia_1_4b {
}
# pythia_2_8b
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b {
cd ${SRC}
@@ -561,8 +556,8 @@ function gg_run_pythia_2_8b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -697,7 +692,7 @@ function gg_run_embd_bge_small {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@@ -766,7 +761,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b


@@ -8,13 +8,6 @@ set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_VULKAN @GGML_VULKAN@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_SYCL @GGML_SYCL@)
set(GGML_OPENMP @GGML_OPENMP@)
@PACKAGE_INIT@
@@ -44,36 +37,18 @@ if (GGML_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
endif()
if (GGML_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
if (GGML_SYCL)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
endif()
find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED)


@@ -77,41 +77,6 @@
using json = nlohmann::ordered_json;
//
// Environment variable utils
//
template<typename T>
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::string(value) : target;
}
template<typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::stoi(value) : target;
}
template<typename T>
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::stof(value) : target;
}
template<typename T>
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
if (value) {
std::string val(value);
target = val == "1" || val == "true";
}
}
//
// CPU utils
//
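The get_env overloads above dispatch on the target's type through enable_if, so a single call site parses strings, integers, floats, and booleans from the environment. An illustrative use (the variable names below are hypothetical, not the actual gpt_params fields):

    // Illustrative only: each call resolves to the matching get_env overload.
    std::string model_path;        // std::string overload
    int32_t     n_ctx     = 512;   // integral overload (std::stoi)
    float       rope_base = 0.0f;  // floating-point overload (std::stof)
    bool        use_mmap  = true;  // bool overload ("1" / "true")

    get_env("LLAMA_ARG_MODEL",    model_path);
    get_env("LLAMA_ARG_CTX_SIZE", n_ctx);
    get_env("LLAMA_ROPE_BASE",    rope_base); // hypothetical variable
    get_env("LLAMA_USE_MMAP",     use_mmap);  // hypothetical variable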
@@ -145,34 +110,8 @@ int32_t cpu_get_num_physical_cores() {
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
unsigned int n_threads_win = std::thread::hardware_concurrency();
unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
DWORD buffer_size = 0;
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return default_threads;
}
}
std::vector<char> buffer(buffer_size);
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
return default_threads;
}
int32_t num_physical_cores = 0;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
while (buffer_size > 0) {
if (info->Relationship == RelationProcessorCore) {
num_physical_cores += info->Processor.GroupCount;
}
buffer_size -= info->Size;
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
}
return num_physical_cores > 0 ? num_physical_cores : default_threads;
#elif defined(_WIN32)
//TODO: Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -255,6 +194,12 @@ int32_t cpu_get_num_math() {
// CLI argument parsing
//
void gpt_params_handle_hf_token(gpt_params & params) {
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
params.hf_token = std::getenv("HF_TOKEN");
}
}
void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
@@ -302,9 +247,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
gpt_params_handle_model_default(params);
if (params.hf_token.empty()) {
get_env("HF_TOKEN", params.hf_token);
}
gpt_params_handle_hf_token(params);
if (params.escape) {
string_process_escapes(params.prompt);
@@ -324,25 +267,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
return true;
}
void gpt_params_parse_from_env(gpt_params & params) {
// we only care about server-related params for now
get_env("LLAMA_ARG_MODEL", params.model);
get_env("LLAMA_ARG_THREADS", params.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
const auto params_org = params; // the example can modify the default params
@@ -760,24 +684,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--lora") {
CHECK_ARG
params.lora_adapters.push_back({
std::string(argv[i]),
1.0,
});
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false;
return true;
}
if (arg == "--lora-scaled") {
CHECK_ARG
std::string lora_adapter = argv[i];
const char* lora_adapter = argv[i];
CHECK_ARG
params.lora_adapters.push_back({
lora_adapter,
std::stof(argv[i]),
});
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false;
return true;
}
if (arg == "--lora-init-without-apply") {
params.lora_init_without_apply = true;
if (arg == "--lora-base") {
CHECK_ARG
params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
@@ -876,10 +797,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true;
return true;
}
if (arg == "-nocb" || arg == "--no-cont-batching") {
params.cont_batching = false;
return true;
}
if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
return true;
@@ -901,7 +818,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
CHECK_ARG
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
@@ -1355,7 +1272,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1410,10 +1326,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
if (arg == "--no-warmup") {
params.warmup = false;
return true;
}
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1536,7 +1448,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
@@ -1627,7 +1538,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
@@ -1670,8 +1580,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@@ -1720,7 +1631,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -1740,7 +1651,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
#ifndef LOG_DISABLE_LOGS
options.push_back({ "logging" });
@@ -1763,13 +1673,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
options.push_back({ "export-lora" });
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
for (const auto & o : options) {
@@ -1803,13 +1706,7 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
if (params.n_threads_batch != -1) {
os << " (n_threads_batch = " << params.n_threads_batch << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif
return os.str();
}
@@ -1859,23 +1756,6 @@ std::string string_get_sortable_timestamp() {
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
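For reference, a hedged usage sketch of the replace helper above (hypothetical input values; assumes the declaration from common.h). The builder-based loop appends each unmatched run plus the replacement, avoiding repeated in-place `erase`/`insert` calls:

```cpp
#include <cassert>
#include <string>

// string_replace_all(s, search, replace) as defined above.
int main() {
    std::string s = "1,2,,3";
    string_replace_all(s, ",", "; ");
    assert(s == "1; 2; ; 3"); // every non-overlapping occurrence is replaced
    return 0;
}
```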
void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
@@ -2149,8 +2029,8 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
@@ -2165,7 +2045,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
return std::make_tuple(nullptr, nullptr);
}
auto cparams = llama_context_params_from_gpt_params(params);
@@ -2174,7 +2054,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return iparams;
return std::make_tuple(nullptr, nullptr);
}
if (!params.control_vectors.empty()) {
@@ -2185,7 +2065,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
return iparams;
return std::make_tuple(nullptr, nullptr);
}
int err = llama_control_vector_apply(lctx,
@@ -2197,26 +2077,26 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (err) {
llama_free(lctx);
llama_free_model(model);
return iparams;
return std::make_tuple(nullptr, nullptr);
}
}
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
int err = llama_model_apply_lora_from_file(model,
lora_adapter.c_str(),
lora_scale,
((i > 0) || params.lora_base.empty())
? NULL
: params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return iparams;
return std::make_tuple(nullptr, nullptr);
}
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
}
if (params.ignore_eos) {
@@ -2244,26 +2124,13 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
}
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
llama_reset_timings(lctx);
}
iparams.model = model;
iparams.context = lctx;
return iparams;
}
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.adapter, la.scale);
}
}
return std::make_tuple(model, lctx);
}
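A caller-side sketch of the tuple-returning variant above (illustrative names, not part of this diff): both members must be unpacked and null-checked separately, which is what the header's TODO about replacing the tuple with a struct is getting at.

```cpp
// Hedged caller-side sketch: unpack and null-check the (model, context) tuple.
#include <cstdio>
#include <tuple>

static bool init_or_fail(gpt_params & params, llama_model *& model, llama_context *& ctx) {
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        std::fprintf(stderr, "initialization failed\n");
        return false;
    }
    return true;
}
```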
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -2790,6 +2657,12 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
return text;
}
bool llama_should_add_bos_token(const llama_model * model) {
const int add_bos = llama_add_bos_token(model);
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
//
// Chat template utils
//
@@ -2850,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
const llama_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3282,18 +3155,20 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, "lora:\n");
for (auto & la : params.lora_adapters) {
if (la.scale == 1.0f) {
fprintf(stream, " - %s\n", la.path.c_str());
for (std::tuple<std::string, float> la : params.lora_adapter) {
if (std::get<1>(la) != 1.0f) {
continue;
}
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
}
fprintf(stream, "lora_scaled:\n");
for (auto & la : params.lora_adapters) {
if (la.scale != 1.0f) {
fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
for (std::tuple<std::string, float> la : params.lora_adapter) {
if (std::get<1>(la) == 1.0f) {
continue;
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

View File

@@ -33,15 +33,6 @@
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct llama_lora_adapter_info {
std::string path;
float scale;
};
struct llama_lora_adapter_container : llama_lora_adapter_info {
struct llama_lora_adapter * adapter;
};
// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
@@ -135,8 +126,9 @@ struct gpt_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -263,11 +255,9 @@ struct gpt_params {
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@@ -286,8 +276,6 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
@@ -319,13 +307,8 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//
struct llama_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
std::vector<llama_lora_adapter_container> lora_adapters;
};
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
// TODO: avoid tuple, use struct
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -333,9 +316,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
// clear LoRA adapters from context, then apply new list of adapters
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
// Batch utils
void llama_batch_clear(struct llama_batch & batch);
@@ -380,6 +360,10 @@ std::string llama_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
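A small usage sketch for this helper (a hedged fragment; assumes the `llama_tokenize` wrapper from this header, which takes an `add_special` flag, and a `model`/`ctx`/`prompt` already in scope):

```cpp
// Sketch: let model metadata (with the SPM fallback) decide whether to prepend BOS.
const bool add_bos = llama_should_add_bos_token(model);
std::vector<llama_token> inp = llama_tokenize(ctx, prompt, /*add_special=*/add_bos);
```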
//
// Chat template utils
//

View File

@@ -369,9 +369,6 @@ namespace grammar_parser {
}
// Validate the state to ensure that all rules are defined
for (const auto & rule : state.rules) {
if (rule.empty()) {
throw std::runtime_error("Undefined rule");
}
for (const auto & elem : rule) {
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
// Ensure that the rule at that location exists

View File

@@ -37,18 +37,11 @@ struct llama_ngram {
}
};
struct llama_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
size_t hash = 0;
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
}
return hash;
}
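The removed hashers above use Fibonacci hashing. A hedged, self-contained sketch of the idea: the multiplier is 2^64 / φ (the golden ratio), which spreads small consecutive token ids across the whole 64-bit range before the per-token hashes are XOR-combined, whereas `std::hash` on integers is frequently the identity, making XOR-combined n-gram hashes collide easily.

```cpp
#include <cstdint>
#include <cstdio>

// Fibonacci hashing: multiply by 2^64 / golden_ratio (an odd 64-bit constant).
static uint64_t fib_hash(uint64_t token) {
    return token * 11400714819323198485llu;
}

int main() {
    for (uint64_t t = 1; t <= 3; ++t) {
        std::printf("token %llu -> %016llx\n",
                    (unsigned long long) t, (unsigned long long) fib_hash(t));
    }
    return 0;
}
```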

View File

@@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
return cur_p;
@@ -455,6 +455,6 @@ void llama_sampling_accept(
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
}
}
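For clarity, a hedged sketch of the single-token grammar validity check shown earlier in this hunk (it mirrors the call order of the new signature above; struct layouts follow llama.h):

```cpp
#include <cmath> // INFINITY

// Sketch: apply the grammar to a one-element candidate array; a logit of
// -INFINITY afterwards means the grammar rejected the token.
static bool token_is_grammar_valid(llama_context * ctx_main,
                                   llama_grammar * grammar,
                                   llama_token     id) {
    llama_token_data       single_token_data       = { id, 0.0f, 0.0f };
    llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
    llama_grammar_sample(grammar, ctx_main, &single_token_data_array);
    return single_token_data_array.data[0].logit != -INFINITY;
}
```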

File diff suppressed because it is too large

View File

@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
@@ -91,12 +91,6 @@ models = [
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
]
@@ -105,8 +99,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as downloaded_file:
downloaded_file.write(response.content)
with open(save_path, 'wb') as f:
f.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
@@ -165,7 +159,7 @@ for model in models:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(CHK_TXT)
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
@@ -197,7 +191,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(CHK_TXT)}
chktxt = {repr(chktxt)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -293,7 +287,7 @@ tests = [
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
CHK_TXT,
chktxt,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp

View File

@@ -116,7 +116,7 @@ class Tensor:
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dtype= dtype
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])
@@ -132,10 +132,6 @@ class Tensor:
class GGMLModel:
file_format: GGMLFormat
format_version: int
def __init__(self):
self.hyperparameters = None
self.vocab = None
@@ -294,7 +290,7 @@ class GGMLToGGUF:
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)

View File

@@ -1,394 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
from dataclasses import dataclass
import logging
import argparse
import os
import sys
import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
import torch
if TYPE_CHECKING:
from torch import Tensor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model
logger = logging.getLogger("lora-to-gguf")
@dataclass
class PartialLoraTensor:
A: Tensor | None = None
B: Tensor | None = None
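For orientation (a hedged note, not part of the original script): the class below lazily emulates the standard LoRA decomposition

$$W' = W + \frac{\alpha}{r}\, B A, \qquad A \in \mathbb{R}^{r \times n_{\text{in}}}, \quad B \in \mathbb{R}^{n_{\text{out}} \times r},$$

so the product $BA$ has the full weight shape $n_{\text{out}} \times n_{\text{in}}$, which matches the class's `shape` property, `(*B.shape[:-1], A.shape[-1])`. The $\alpha/r$ scaling is assumed to be applied by the consumer at load time rather than in this script.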
# magic to support tensor shape modifications and splitting
class LoraTorchTensor:
_lora_A: Tensor # (n_rank, row_size)
_lora_B: Tensor # (col_size, n_rank)
_rank: int
def __init__(self, A: Tensor, B: Tensor):
assert len(A.shape) == len(B.shape)
assert A.shape[-2] == B.shape[-1]
if A.dtype != B.dtype:
A = A.to(torch.float32)
B = B.to(torch.float32)
self._lora_A = A
self._lora_B = B
self._rank = B.shape[-1]
def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
return (self._lora_A, self._lora_B)
def __getitem__(
self,
indices: (
SupportsIndex
| slice
| tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
),
) -> LoraTorchTensor:
shape = self.shape
if isinstance(indices, SupportsIndex):
if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else:
raise NotImplementedError # can't return a vector
elif isinstance(indices, slice):
if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else:
return LoraTorchTensor(self._lora_A, self._lora_B[indices])
elif isinstance(indices, tuple):
assert len(indices) > 0
if indices[-1] is Ellipsis:
return self[indices[:-1]]
# expand ellipsis
indices = tuple(
u
for v in (
(
(slice(None, None) for _ in range(len(indices) - 1))
if i is Ellipsis
else (i,)
)
for i in indices
)
for u in v
)
if len(indices) < len(shape):
indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
# TODO: make sure this is correct
indices_A = (
*(
(
j.__index__() % self._lora_A.shape[i]
if isinstance(j, SupportsIndex)
else slice(None, None)
)
for i, j in enumerate(indices[:-2])
),
slice(None, None),
indices[-1],
)
indices_B = indices[:-1]
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
else:
raise NotImplementedError # unknown index type
@property
def dtype(self) -> torch.dtype:
assert self._lora_A.dtype == self._lora_B.dtype
return self._lora_A.dtype
@property
def shape(self) -> tuple[int, ...]:
assert len(self._lora_A.shape) == len(self._lora_B.shape)
return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
def size(self, dim=None):
assert dim is None
return self.shape
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
if isinstance(shape[0], tuple):
new_shape: tuple[int, ...] = shape[0]
else:
new_shape = cast(tuple[int, ...], shape)
orig_shape = self.shape
if len(new_shape) < 2:
raise NotImplementedError # can't become a vector
# expand -1 in the shape
if any(dim == -1 for dim in new_shape):
n_elems = prod(orig_shape)
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
assert n_elems % n_new_elems == 0
new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
if new_shape[-1] != orig_shape[-1]:
raise NotImplementedError # can't reshape the row size trivially
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
shape_B = (*new_shape[:-1], self._rank)
return LoraTorchTensor(
self._lora_A.reshape(shape_A),
self._lora_B.reshape(shape_B),
)
def reshape_as(self, other: Tensor) -> LoraTorchTensor:
return self.reshape(*other.shape)
def view(self, *size: int) -> LoraTorchTensor:
return self.reshape(*size)
def permute(self, *dims: int) -> LoraTorchTensor:
shape = self.shape
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
if dims[-1] == -1:
# TODO: support higher dimensional A shapes bigger than 1
assert all(dim == 1 for dim in self._lora_A.shape[:-2])
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
else:
# TODO: compose the above two
raise NotImplementedError
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
shape = self.shape
dims = [i for i in range(len(shape))]
dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
return self.permute(*dims)
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
return self.transpose(axis0, axis1)
def to(self, *args, **kwargs):
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
@classmethod
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
del types # unused
if kwargs is None:
kwargs = {}
if func is torch.permute:
return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape:
return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack:
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
return LoraTorchTensor(
torch.stack([a._lora_A for a in args[0]], dim),
torch.stack([b._lora_B for b in args[0]], dim),
)
elif func is torch.cat:
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
if len(args[0][0].shape) > 2:
return LoraTorchTensor(
torch.cat([a._lora_A for a in args[0]], dim),
torch.cat([b._lora_B for b in args[0]], dim),
)
elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
return LoraTorchTensor(
args[0][0]._lora_A,
torch.cat([b._lora_B for b in args[0]], dim),
)
else:
raise NotImplementedError
else:
raise NotImplementedError
def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
return base_name
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"--no-lazy", action="store_true",
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
)
parser.add_argument(
"--verbose", action="store_true",
help="increase output verbosity",
)
parser.add_argument(
"--dry-run", action="store_true",
help="only print out what will be done, without writing any new files",
)
parser.add_argument(
"--base", type=Path, required=True,
help="directory containing base model file",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing LoRA adapter file",
)
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED,
}
ftype = ftype_map[args.outtype]
dir_base_model: Path = args.base
dir_lora: Path = args.lora_path
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_lora
if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
lora_model = load_file(input_model, device="cpu")
else:
input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load base model
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class):
model_arch = model_class.model_arch
lora_alpha: float
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
super().__init__(*args, **kwargs)
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {}
for name, tensor in lora_model.items():
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name
is_lora_b = ".lora_B.weight" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
sys.exit(1)
if base_name in tensor_map:
if is_lora_a:
tensor_map[base_name].A = tensor
else:
tensor_map[base_name].B = tensor
else:
if is_lora_a:
tensor_map[base_name] = PartialLoraTensor(A=tensor)
else:
tensor_map[base_name] = PartialLoraTensor(B=tensor)
for name, tensor in tensor_map.items():
assert tensor.A is not None
assert tensor.B is not None
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel(
dir_base_model,
ftype,
fname_out,
is_big_endian=args.bigendian,
use_temp_file=False,
eager=args.no_lazy,
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
is_lora=True,
)
logger.info("Exporting model...")
model_instance.write()
logger.info(f"Model successfully exported to {model_instance.fname_out}")

View File

@@ -1,259 +0,0 @@
# llama.cpp for CANN
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Model Supports](#model-supports)
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [TODO](#todo)
## Background
**Ascend NPU** is a range of AI processors built around a Neural Processing Unit. They efficiently handle matrix-matrix multiplication, dot products and scalar operations.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
**Llama.cpp + CANN**
The llama.cpp CANN backend is designed to support the Ascend NPU. It uses AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to drive the Ascend NPU directly.
## News
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
- Create CANN backend for Ascend NPU.
## OS
| OS | Status | Verified |
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 22.04, OpenEuler22.03 |
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
*Notes:*
- If you have trouble with an Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the table above.
## Model Supports
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
## DataType Supports
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
## Docker
### Build Images
You can build an image with llama.cpp in one command.
```sh
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
```
### Run container
```sh
# Find all cards.
npu-smi info
# Select the cards that you want to use; make sure these cards are not in use by someone else.
# The following uses the card of device 0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
```
*Notes:*
- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### I. Setup Environment
1. **Install Ascend Driver and firmware**
```sh
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
Once installed, run `npu-smi info` to check whether the driver was installed successfully.
```sh
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
```
2. **Install Ascend Firmware**
```sh
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
If the following message appears, the firmware was installed successfully.
```sh
Firmware package installed successfully!
```
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the version that matches your system. The minimum required version is 8.0.RC2.alpha002; the install commands are shown below.
```sh
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
```
Set Ascend Variables:
```sh
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
```
Upon a successful installation, CANN is enabled for the available Ascend devices.
### II. Build llama.cpp
```sh
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
### III. Run the inference
1. **Retrieve and prepare model**
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model preparation.
**Notes**:
- The CANN backend currently only supports FP16/Q4_0/Q8_0 models.
2. **Launch inference**
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same backend.
| Device selection | Parameter |
|:----------------:|:--------------------------------------:|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
- Use device 0:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issue/PR titles to help the CANN team check and address them without delay.
## TODO
- Support more models and data types.

View File

@@ -20,7 +20,7 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@@ -28,6 +28,10 @@
The llama.cpp SYCL backend is designed primarily to support **Intel GPUs**. Thanks to SYCL's cross-platform nature, it can also support other vendors' GPUs: Nvidia GPUs (*AMD GPUs coming*).
When targeting an **Intel CPU**, it is recommended to use the llama.cpp [Intel oneMKL](README.md#intel-onemkl) backend.
It has a similar design to other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc.*. In the initial work, oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## Recommended Release
The SYCL backend may be broken by some PRs because there is no online CI.
@@ -41,10 +45,6 @@ The following release is verified with good quality:
## News
- 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully.
@@ -80,14 +80,7 @@ The following release is verified with good quality:
### Intel GPU
SYCL backend supports Intel GPU Family:
- Intel Data Center Max Series
- Intel Flex Series, Arc Series
- Intel Built-in Arc GPU
- Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)).
#### Verified devices
**Verified devices**
| Intel GPU | Status | Verified Model |
|-------------------------------|---------|---------------------------------------|
@@ -95,7 +88,7 @@ SYCL backend supports Intel GPU Family:
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
@@ -196,7 +189,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
The following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
- **Adding support to Nvidia GPUs**
Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device
### II. Build llama.cpp
#### Intel GPU
```
./examples/sycl/build.sh
```
or
```sh
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
@@ -288,71 +276,48 @@ cmake --build build --config Release -j -v
### III. Run the inference
#### Retrieve and prepare model
1. Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
##### Check device
1. Enable oneAPI running environment
2. Enable oneAPI running environment
```sh
source /opt/intel/oneapi/setvars.sh
```
2. List devices information
3. List devices information
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
```sh
./build/bin/llama-ls-sycl-device
```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
An example of such a log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
```
found 2 SYCL devices:
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
#### Choose level-zero devices
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
|Chosen Device ID|Setting|
|-|-|
|0|`export ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|1|`export ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`export ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
#### Execute
Choose one of the following methods to run.
1. Script
- Use device 0:
```sh
./examples/sycl/run_llama2.sh 0
```
- Use multiple devices:
```sh
./examples/sycl/run_llama2.sh
```
2. Command line
Launch inference
4. Launch inference
There are two device selection modes:
- Single device: Use one device assigned by the user. The default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR.
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest max compute units.
| Device selection | Parameter |
|------------------|----------------------------------------|
@@ -366,6 +331,11 @@ Examples:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
or run by script:
```sh
./examples/sycl/run_llama2.sh 0
```
- Use multiple devices:
@@ -373,6 +343,12 @@ ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Bui
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
Otherwise, you can run the script:
```sh
./examples/sycl/run_llama2.sh
```
*Notes:*
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follows:
@@ -419,7 +395,7 @@ c. Verify installation
In the oneAPI command line, run the following to print the available SYCL devices:
```
sycl-ls.exe
sycl-ls
```
There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
@@ -440,18 +416,6 @@ b. The new Visual Studio will install Ninja as default. (If not, please install
### II. Build llama.cpp
You could download the release package for Windows directly, which includes the binary files and required oneAPI DLL files.
Choose one of the following methods to build from source code.
1. Script
```sh
.\examples\sycl\win-build-sycl.bat
```
2. CMake
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
```
@@ -466,8 +430,12 @@ cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPI
cmake --build build --config Release -j
```
Or, use CMake presets to build:
Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the preceding instructions:
```sh
.\examples\sycl\win-build-sycl.bat
```
Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
@@ -479,9 +447,7 @@ cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
```
3. Visual Studio
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
*Notes:*
@@ -489,65 +455,52 @@ You can use Visual Studio to open llama.cpp folder as a CMake project. Choose th
### III. Run the inference
#### Retrieve and prepare model
1. Retrieve and prepare model
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model preparation, or simply download the [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as an example.
##### Check device
1. Enable oneAPI running environment
2. Enable oneAPI running environment
On the oneAPI command line window, run the following and step into the llama.cpp directory:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
2. List devices information
3. List devices information
Similar to the native `sycl-ls`, available SYCL devices can be queried as follows:
```
build\bin\llama-ls-sycl-device.exe
build\bin\ls-sycl-device.exe
```
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
```
found 2 SYCL devices:
found 6 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
#### Choose level-zero devices
|Chosen Device ID|Setting|
|-|-|
|0|`set ONEAPI_DEVICE_SELECTOR="level_zero:0"` or no action|
|1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
|0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"`|
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero runtime, recommended |
| compute capability 3.0 | OpenCL runtime, slower than level-zero in most cases |
#### Execute
Choose one of the following methods to run.
1. Script
```
examples\sycl\win-run-llama2.bat
```
2. Command line
Launch inference
4. Launch inference
There are two device selection modes:
- Single device: Use one device assigned by the user. The default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR.
- Single device: Use one device assigned by the user.
- Multiple devices: Automatically choose the devices with the same largest max compute units.
| Device selection | Parameter |
|------------------|----------------------------------------|
@@ -567,7 +520,11 @@ build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website ca
```
build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
```
Otherwise, run the following wrapper script:
```
.\examples\sycl\win-run-llama2.bat
```
Note:
@@ -581,18 +538,17 @@ Or
use 1 SYCL GPUs: [0] with Max compute units:512
```
## Environment Variable
#### Build
| Name | Value | Function |
|--------------------|-----------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized models|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
#### Runtime
@@ -628,18 +584,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
```
Otherwise, please double-check the GPU driver installation steps.
- Can I report an Ollama issue on Intel GPU to the llama.cpp SYCL backend?
No. We can't support Ollama issues directly, because we aren't familiar with Ollama.
We suggest reproducing on llama.cpp and reporting a similar issue to llama.cpp; we will support it.
The same applies to other projects that include the llama.cpp SYCL backend.
### **GitHub contribution**:
Please add the **[SYCL]** prefix/tag in issue/PR titles to help the SYCL team check and address them without delay.
## TODO
- NA
- Support row layer split for multiple card runs.

View File

@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
make
```
- On Windows (x86/x64 only, arm64 requires cmake):
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
@@ -60,17 +60,6 @@ In order to build llama.cpp you have four different options.
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake):
- Tab Workload: Desktop development with C++
- Tab Components (select quickly via search): C++ _CMake_ Tools for Windows, _Git_ for Windows, C++ _Clang_ Compiler for Windows, MS-Build Support for LLVM Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build and test
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 can also be done with just MSVC (using the build-arm64-windows-MSVC preset or the standard CMake build instructions). However, MSVC does not support inline ARM assembly code, which is used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD):
@@ -178,11 +167,7 @@ For Jetson users, if you have Jetson Orin, you can try this: [Official Support](ht
cmake --build build --config Release
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
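As a minimal runtime sketch (the binary and model paths are hypothetical), both variables can be set for a single run like so:
```python
import os
import subprocess

# Sketch only: restrict the run to GPU 0 and opt into unified memory
# (the Linux behavior described above).
env = dict(os.environ,
           CUDA_VISIBLE_DEVICES="0",
           GGML_CUDA_ENABLE_UNIFIED_MEMORY="1")
subprocess.run(["./build/bin/llama-cli", "-m", "models/model.gguf",
                "-p", "Hello", "-ngl", "99"], env=env, check=True)
```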
The following compilation options are also available to tweak performance:
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
| Option | Legal values | Default | Description |
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -196,19 +181,6 @@ The following compilation options are also available to tweak performance:
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
### MUSA
- Using `make`:
```bash
make GGML_MUSA=1
```
- Using `CMake`:
```bash
cmake -B build -DGGML_MUSA=ON
cmake --build build --config Release
```
### hipBLAS
This provides BLAS acceleration on HIP-supported AMD GPUs.
@@ -270,45 +242,6 @@ The following compilation options are also available to tweak performance (yes,
### Vulkan
**Windows**
#### w64devkit
Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
```sh
SDK_VERSION=1.3.283.0
cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
Name: Vulkan-Loader
Description: Vulkan Loader
Version: $SDK_VERSION
Libs: -lvulkan-1
EOF
```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
#### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
```
Switch into `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
**With docker**:
You don't need to install Vulkan SDK. It will be installed inside the container.
@@ -352,31 +285,6 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical API that helps you quickly build AI applications and services based on the Ascend NPU.
For more information about the Ascend NPU, see the [Ascend Community](https://www.hiascend.com/en/).
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
Go to `llama.cpp` directory and build using CMake.
```bash
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
You can test with:
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the following info is output on screen, you are using `llama.cpp` with the CANN backend:
```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
```
For detailed info, such as supported models/devices and CANN installation, please refer to [llama.cpp for CANN](./backend/CANN.md).
### Android
To read documentation for how to build on Android, [click here](./android.md)
View File
@@ -9,15 +9,15 @@ Adding a model requires a few steps:
After following these steps, you can open a PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)
### 1. Convert the model to GGUF
This step is done in Python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
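The writer side of that flow can be sketched with the `gguf` package directly; the architecture name, metadata and single tensor below are illustrative stand-ins, not a real model:
```python
import numpy as np
import gguf  # pip install gguf

# Minimal sketch of what a convert script ultimately does: open a writer
# for an architecture, add metadata and tensor data, then write the file.
writer = gguf.GGUFWriter("tiny.gguf", "llama")
writer.add_description("illustrative single-tensor file")
writer.add_tensor("token_embd.weight", np.zeros((16, 8), dtype=np.float32))
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```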
@@ -31,7 +31,7 @@ class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
```
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
Add an enum entry in `MODEL_ARCH`, the model's human-friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`, as sketched below.
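A self-contained sketch of the shape of those three additions (the tensor list is shortened to plain strings here; the real entries are `MODEL_TENSOR` enum values):
```python
from enum import IntEnum, auto

class MODEL_ARCH(IntEnum):
    LLAMA = auto()
    GROK  = auto()            # 1. new enum entry for the architecture

MODEL_ARCH_NAMES = {
    MODEL_ARCH.GROK: "grok",  # 2. human-friendly name
}

MODEL_TENSORS = {
    MODEL_ARCH.GROK: [        # 3. GGUF tensor names used by the arch
        "token_embd",
        "output_norm",
        "attn_q",
        "attn_k",
        "attn_v",
    ],
}
```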
@@ -54,7 +54,7 @@ Example for `falcon` model:
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
If the tensor name is part of a repetitive layer/block, the keyword `bid` serves as a placeholder for the block index (see the sketch below).
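For instance, a mapping entry has roughly this shape (the source-model names follow common Hugging Face conventions; the exact entries live in `tensor_mapping.py`):
```python
# One GGUF name on the left, candidate source-model names on the right;
# {bid} stands in for the block index of every repeated layer.
block_mappings = {
    "blk.{bid}.attn_norm": (
        "model.layers.{bid}.input_layernorm",  # llama-hf style
        "transformer.h.{bid}.ln_1",            # gpt2 style
    ),
}

# Expanding the placeholder for block 7:
assert "blk.{bid}.attn_norm".format(bid=7) == "blk.7.attn_norm"
```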
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all; support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/).
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
## GGUF specification
View File
@@ -1,7 +1,7 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
View File
@@ -21,6 +21,7 @@ else()
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(finetune)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
@@ -52,4 +53,5 @@ else()
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
endif()
View File
@@ -1,6 +1,7 @@
#include "ggml.h"
#include "train.h"
#include <vector>
#include <cassert>
#include <cstdlib>
#include <cstring>
View File
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
// ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
View File
@@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
int n_predict = params.n_predict;
int n_predict = 32;
// init LLM

View File
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
import numpy as np
@@ -346,6 +346,42 @@ class Params:
return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
license: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
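# Usage sketch (hypothetical path): keys missing from the JSON simply
# leave the corresponding Metadata fields as None.
#
#   metadata = Metadata.load(Path("models/my-model/metadata.json"))
#   print(metadata.name, metadata.author)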
#
# data loading
# TODO: reuse (probably move to gguf.py?)
@@ -770,7 +806,7 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
# Metadata About The Model And Its Provenance
name = "LLaMA"
if metadata is not None and metadata.name is not None:
@@ -788,73 +824,16 @@ class OutputFile:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.organization is not None:
self.gguf.add_organization(metadata.organization)
if metadata.finetune is not None:
self.gguf.add_finetune(metadata.finetune)
if metadata.basename is not None:
self.gguf.add_basename(metadata.basename)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.quantized_by is not None:
self.gguf.add_quantized_by(metadata.quantized_by)
if metadata.size_label is not None:
self.gguf.add_size_label(metadata.size_label)
if metadata.license is not None:
self.gguf.add_license(metadata.license)
if metadata.license_name is not None:
self.gguf.add_license_name(metadata.license_name)
if metadata.license_link is not None:
self.gguf.add_license_link(metadata.license_link)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.doi is not None:
self.gguf.add_doi(metadata.doi)
if metadata.uuid is not None:
self.gguf.add_uuid(metadata.uuid)
if metadata.repo_url is not None:
self.gguf.add_repo_url(metadata.repo_url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.license is not None:
self.gguf.add_licence(metadata.license)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_doi is not None:
self.gguf.add_source_doi(metadata.source_doi)
if metadata.source_uuid is not None:
self.gguf.add_source_uuid(metadata.source_uuid)
if metadata.source_repo_url is not None:
self.gguf.add_source_repo_url(metadata.source_repo_url)
if metadata.base_models is not None:
self.gguf.add_base_model_count(len(metadata.base_models))
for key, base_model_entry in enumerate(metadata.base_models):
if "name" in base_model_entry:
self.gguf.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
self.gguf.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
self.gguf.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
if metadata.tags is not None:
self.gguf.add_tags(metadata.tags)
if metadata.languages is not None:
self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
@@ -965,7 +944,7 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@@ -999,7 +978,7 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: gguf.Metadata | None = None,
metadata: Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@@ -1042,32 +1021,35 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}")
def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
total_params = 0
shared_params = 0
expert_params = 0
for name, lazy_tensor in tensors:
# We don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Got A Tensor
sum_weights_in_tensor: int = 1
# Tensor Volume
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
if ".experts." in name:
if ".experts.0." in name:
expert_params += sum_weights_in_tensor
else:
shared_params += sum_weights_in_tensor
total_params += sum_weights_in_tensor
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
return total_params, shared_params, expert_params
return f"{round(scaled_model_params)}{scale_suffix}"
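# Worked example of the rounding notation above (illustrative count):
#   model_parameter_count_rounded_notation(6_738_415_616)
#   -> 6_738_415_616 * 1e-9 ≈ 6.74, which rounds to "7B"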
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1249,24 +1231,34 @@ class VocabFactory:
return vocab, special_vocab
def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
name = metadata.name if metadata.name is not None else None
basename = metadata.basename if metadata.basename is not None else None
finetune = metadata.finetune if metadata.finetune is not None else None
version = metadata.version if metadata.version is not None else None
size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
output_type = {
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
quantization = {
GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type]
return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
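# Worked example of the resulting filename (illustrative values):
#   name = "mixtral", metadata.version = "v0.1", n_experts = 8,
#   parameters = "47B", quantization = "Q8_0"
#   -> "mixtral-v0.1-8x47B-Q8_0" (default_outfile then appends ".gguf")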
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths:
logger.error(
@@ -1305,9 +1297,8 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
args = parser.parse_args(args_in)
@@ -1319,36 +1310,32 @@ def main(args_in: list[str] | None = None) -> None:
else:
logging.basicConfig(level=logging.INFO)
model_name = args.model_name
dir_model = args.model
metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
metadata = Metadata.load(args.metadata)
if args.get_outfile:
model_plus = load_some_model(dir_model)
model_plus = load_some_model(args.model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
ftype = pick_output_type(model, args.outtype)
if (metadata is None or metadata.name is None) and params.path_model is not None:
metadata.name = params.path_model.name
print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.dump_single:
model_plus = lazy_load_file(dir_model)
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
return
if not args.vocab_only:
model_plus = load_some_model(dir_model)
model_plus = load_some_model(args.model)
else:
model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
if args.dump:
do_dump_model(model_plus)
@@ -1381,7 +1368,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
@@ -1412,21 +1399,13 @@ def main(args_in: list[str] | None = None) -> None:
assert params is not None
if metadata.name is None and params.path_model is not None:
metadata.name = params.path_model.name
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}")
View File
@@ -271,7 +271,7 @@ struct tokenized_prompt {
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -414,10 +414,9 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
// load the model to get hparams
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model);
View File
@@ -13,6 +13,7 @@ Please update all scripts and workflows to use the new binary names.
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| finetune | llama-finetune |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
@@ -44,6 +45,7 @@ Please update all scripts and workflows to use the new binary names.
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| train-text-from-scratch | llama-train-text-from-scratch |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |
View File
@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.):
```bash
./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```
### Windows:
```powershell
llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```
The above command will output space-separated float values.
@@ -50,11 +50,11 @@ The above command will output space-separated float values.
### Unix-based systems (Linux, macOS, etc.):
```bash
./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
### Windows:
```powershell
llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
View File
@@ -31,24 +31,13 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
const struct llama_model * model = llama_get_model(ctx);
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
}
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
@@ -56,22 +45,11 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
continue;
}
const float * embd = nullptr;
int embd_pos = 0;
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
// try to get token embeddings
embd = llama_get_embeddings_ith(ctx, i);
embd_pos = i;
GGML_ASSERT(embd != NULL && "failed to get token embeddings");
} else {
// try to get sequence embeddings - supported only when pooling_type is not NONE
embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
embd_pos = batch.seq_id[i][0];
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
}
float * out = output + embd_pos * n_embd;
float * out = output + batch.seq_id[i][0] * n_embd;
llama_embd_normalize(embd, out, n_embd, embd_norm);
}
}
@@ -101,11 +79,11 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model;
llama_context * ctx;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
@@ -115,9 +93,8 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
return 1;
}
@@ -176,23 +153,13 @@ int main(int argc, char ** argv) {
const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// count number of embeddings
int n_embd_count = 0;
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int k = 0; k < n_prompts; k++) {
n_embd_count += inputs[k].size();
}
} else {
n_embd_count = n_prompts;
}
// allocate output
const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_embd_count * n_embd, 0);
std::vector<float> embeddings(n_prompts * n_embd, 0);
float * emb = embeddings.data();
// break into batches
int e = 0; // number of embeddings already stored
int p = 0; // number of prompts processed already
int s = 0; // number of prompts in current batch
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
@@ -202,11 +169,11 @@ int main(int argc, char ** argv) {
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + e * n_embd;
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
llama_batch_clear(batch);
p += s;
s = 0;
}
// add to batch
@@ -215,62 +182,39 @@ int main(int argc, char ** argv) {
}
// final batch
float * out = emb + e * n_embd;
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
// print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
fprintf(stdout, " ... ");
for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
fprintf(stdout, "embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
} else {
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
}
}
fprintf(stdout, "\n");
}
fprintf(stdout, "\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str());
// print cosine similarity matrix
if (n_prompts > 1) {
fprintf(stdout, "\n");
printf("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
fprintf(stdout, "%6.6s ", prompts[i].c_str());
}
fprintf(stdout, "\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
}
fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f ", sim);
}
fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
}
}
}
}
@@ -289,23 +233,23 @@ int main(int argc, char ** argv) {
}
fprintf(stdout, notArray ? "]\n }" : "]");
j++;
if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
fprintf(stdout, " [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
j++;
if (j < n_embd_count) fprintf(stdout, ", "); else break;
if (j < n_prompts) fprintf(stdout, ", "); else break;
}
fprintf(stdout, " ]");
i++;
if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
}
fprintf(stdout, "\n ]");
}
View File
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
GGML_ASSERT(false);
}
printf("%12.4f", v);
sum += v;
@@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
@@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -163,10 +163,9 @@ int main(int argc, char ** argv) {
params.warmup = false;
// init
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
View File
@@ -6,28 +6,21 @@ Apply LORA adapters to base model and export the resulting model.
usage: llama-export-lora [options]
options:
-m, --model model path from which to load base model (default '')
--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
--lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
-t, --threads N number of threads to use during computation (default: 4)
-o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
-h, --help show this help message and exit
-m FNAME, --model-base FNAME model path from which to load base model (default '')
-o FNAME, --model-out FNAME path to save exported model (default '')
-l FNAME, --lora FNAME apply LoRA adapter
-s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S
-t N, --threads N number of threads to use during computation (default: 4)
```
For example:
```bash
./bin/llama-export-lora \
-m open-llama-3b-v2.gguf \
-o open-llama-3b-v2-english2tokipona-chat.gguf \
--lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
```
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
```bash
./bin/llama-export-lora \
-m your_base_model.gguf \
-o your_merged_model.gguf \
--lora-scaled lora_task_A.gguf 0.5 \
--lora-scaled lora_task_B.gguf 0.5
```
Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
View File
@@ -1,423 +1,462 @@
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include <map>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
static bool g_verbose = false;
struct tensor_transformation {
struct ggml_tensor * in;
struct ggml_tensor * out;
bool is_copy;
};
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
}
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error("failed to load input GGUF from " + fname);
}
return ctx_gguf;
}
struct file_input {
struct ggml_context * ctx_meta = nullptr;
struct gguf_context * ctx_gguf = nullptr;
std::ifstream f_in;
std::map<std::string, ggml_tensor *> tensors;
float alpha;
struct lora_info {
std::string filename;
float scale;
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
if (!f_in.is_open()) {
throw std::runtime_error("failed to open input gguf from " + fname);
}
ctx_gguf = load_gguf(fname, &ctx_meta);
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
std::string name(cur->name);
tensors[name] = cur;
if (g_verbose) {
printf("%s: %s\n", __func__, cur->name);
}
}
}
ggml_tensor * get_tensor(std::string name) {
if (tensors.find(name) == tensors.end()) {
return nullptr;
}
return tensors[name];
}
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
if (tensors.find(name) == tensors.end()) {
throw std::runtime_error("cannot find tensor with name: " + name);
}
auto len = ggml_nbytes(tensors[name]);
if (buf.size() < len) {
buf.resize(len);
}
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
f_in.seekg(offset);
f_in.read((char* )buf.data(), len);
}
~file_input() {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
}
};
struct lora_merge_ctx {
// input base model + adapters
file_input base_model;
std::vector<std::unique_ptr<file_input>> adapters;
// for computing merged tensor
struct export_lora_params {
std::string fn_model_base;
std::string fn_model_out;
std::vector<struct lora_info> lora;
int n_threads;
ggml_backend_t backend = nullptr;
ggml_gallocr_t allocr = nullptr;
std::vector<uint8_t> read_buf;
};
// output file
struct gguf_context * ctx_out;
struct ggml_context * ctx_out_ggml;
std::ofstream fout;
struct lora_data {
struct lora_info info;
std::vector<uint8_t> data;
struct ggml_context * ctx;
lora_merge_ctx(
std::string & base_fname,
std::vector<llama_lora_adapter_info> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
uint32_t lora_r;
uint32_t lora_alpha;
};
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
throw std::runtime_error("split model is not yet supported");
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
for (auto & lora_inp : lora_files) {
auto fname = lora_inp.path;
auto scale = lora_inp.scale;
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
check_metadata_lora(adapter.get());
adapters.push_back(std::move(adapter));
}
ctx_out = gguf_init_empty();
struct ggml_init_params params = {
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_out_ggml = ggml_init(params);
backend = ggml_backend_cpu_init();
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
void check_metadata_lora(file_input * adapter) {
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
if (general_type != "adapter") {
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
}
auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
if (adapter_type != "lora") {
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
}
auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
if (general_arch_base != general_arch_lora) {
throw std::runtime_error("model arch and LoRA arch mismatch");
}
}
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
if (t->type == GGML_TYPE_F32) {
return GGML_TYPE_F32;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
} else {
return GGML_TYPE_F16;
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
}
void run_merge() {
// prepare metadata
gguf_set_kv(ctx_out, base_model.ctx_gguf);
// output is forced to f16 for now
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
// check if all lora adapters have the same tensors
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
if (adapters.size() > 1) {
for (size_t i = 1; i < adapters.size(); ++i) {
if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
throw std::runtime_error(err_no_subset_adapter);
}
for (auto & it : adapters[i]->tensors) {
if (adapters[0]->get_tensor(it.first) == nullptr) {
throw std::runtime_error(err_no_subset_adapter);
}
}
}
}
// mapping base tensor to out tensor (same shape with base, but different type)
std::vector<tensor_transformation> trans;
for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
for (auto & adapter : adapters) {
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
}
auto base_tensor = it.second;
if (!t_a && !t_b) {
// only copy
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name);
trans.push_back({
cpy_tensor,
cpy_tensor,
true,
});
gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) {
// need merging
struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name);
trans.push_back({
base_tensor,
out_tensor,
false,
});
gguf_add_tensor(ctx_out, out_tensor);
} else {
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
}
}
// placeholder for the meta data
{
size_t meta_size = gguf_get_meta_size(ctx_out);
zeros(fout, meta_size);
}
// process base model tensors
size_t n_merged = 0;
for (auto & it : trans) {
if (!it.is_copy) {
merge_tensor(it.in, it.out);
n_merged++;
} else {
copy_tensor(it.in);
}
}
// write output metadata
{
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.seekp(0);
fout.write((const char *)data.data(), data.size());
}
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void copy_tensor(struct ggml_tensor * base) {
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
size_t len = ggml_nbytes(base);
base_model.read_tensor_data(base->name, read_buf);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
}
void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
std::string name_base(base->name);
std::string name_lora_a = name_base + ".lora_a";
std::string name_lora_b = name_base + ".lora_b";
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
// context for input tensor
std::vector<struct ggml_tensor *> inp_a(adapters.size());
std::vector<struct ggml_tensor *> inp_b(adapters.size());
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
// alloc tensors
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
// TODO: add support for quantized lora
if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
}
inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b);
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// load base tensor to backend buffer
base_model.read_tensor_data(name_base, read_buf);
if (base->type != GGML_TYPE_F32) {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("read error: %s", strerror(errno));
}
// load lora tensors to backend buffer
for (size_t i = 0; i < adapters.size(); ++i) {
adapters[i]->read_tensor_data(name_lora_a, read_buf);
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
adapters[i]->read_tensor_data(name_lora_b, read_buf);
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
if (ret != 1) {
die("unexpectedly reached end of file");
}
// build graph
struct ggml_cgraph * gf;
{
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params0);
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, out->type);
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
}
// compute
{
ggml_gallocr_alloc_graph(allocr, gf);
ggml_backend_cpu_set_n_threads(backend, n_threads);
ggml_backend_graph_compute(backend, gf);
}
// write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);
}
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
ggml_free(ctx);
ggml_backend_buffer_free(buffer);
}
~lora_merge_ctx() {
ggml_gallocr_free(allocr);
ggml_backend_free(backend);
gguf_free(ctx_out);
ggml_free(ctx_out_ggml);
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
die_fmt("write error: %s", strerror(errno));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
bool eof() {
return tell() >= size;
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
static struct export_lora_params get_default_export_lora_params() {
struct export_lora_params result;
result.fn_model_base = "";
result.fn_model_out = "";
result.n_threads = GGML_DEFAULT_N_THREADS;
return result;
}
printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n");
static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
}
static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
bool invalid_param = false;
std::string arg;
struct export_lora_params default_params = get_default_export_lora_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-m" || arg == "--model-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_base = argv[i];
} else if (arg == "-o" || arg == "--model-out") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_out = argv[i];
} else if (arg == "-l" || arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
lora.scale = 1.0f;
params->lora.push_back(lora);
} else if (arg == "-s" || arg == "--lora-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
lora.scale = std::stof(argv[i]);
params->lora.push_back(lora);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->n_threads = std::stoi(argv[i]);
if (params->n_threads <= 0) {
params->n_threads = std::thread::hardware_concurrency();
}
} else {
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
}
if (params->fn_model_base == default_params.fn_model_base) {
fprintf(stderr, "error: please specify a filename for model-base.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (params->fn_model_out == default_params.fn_model_out) {
fprintf(stderr, "error: please specify a filename for model-out.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
return true;
}
static void free_lora(struct lora_data * lora) {
if (lora->ctx != NULL) {
ggml_free(lora->ctx);
}
delete lora;
}
static struct lora_data * load_lora(struct lora_info * info) {
struct lora_data * result = new struct lora_data;
result->info = *info;
result->ctx = NULL;
result->lora_r = 1;
result->lora_alpha = 1;
struct llama_file file(info->filename.c_str(), "rb");
if (file.fp == NULL) {
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
info->filename.c_str());
free_lora(result);
return NULL;
}
struct ggml_init_params params_ggml;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
uint32_t magic = file.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
}
uint32_t version = file.read_u32();
if (version != 1) {
die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
}
result->lora_r = file.read_u32();
result->lora_alpha = file.read_u32();
// read tensor infos from file
std::vector<char> name_buf;
std::vector<struct ggml_tensor *> tensors;
std::vector<size_t> tensors_offset;
size_t total_nbytes_pad = 0;
while(!file.eof()) {
int64_t ne[4] = {1,1,1,1};
uint32_t n_dims = file.read_u32();
uint32_t namelen = file.read_u32();
uint32_t type = file.read_u32();
for (uint32_t k = 0; k < n_dims; ++k) {
ne[k] = (int64_t)file.read_u32();
}
name_buf.clear();
name_buf.resize(namelen + 1, '\0');
file.read_raw(name_buf.data(), namelen);
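// skip padding: tensor data in this file format starts at the next 32-byte boundary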
file.seek((0-file.tell()) & 31, SEEK_CUR);
size_t offset = file.tell();
struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
ggml_set_name(tensor, name_buf.data());
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
total_nbytes_pad += nbytes_pad;
tensors.push_back(tensor);
tensors_offset.push_back(offset);
file.seek(nbytes, SEEK_CUR);
}
// read tensor data
result->data.resize(total_nbytes_pad);
size_t data_offset = 0;
for (size_t i = 0; i < tensors.size(); ++i) {
struct ggml_tensor * tensor = tensors[i];
size_t offset = tensors_offset[i];
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
file.seek(offset, SEEK_SET);
tensor->data = result->data.data() + data_offset;
file.read_raw(tensor->data, nbytes);
data_offset += nbytes_pad;
}
return result;
}
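// build a graph computing tensor += scaling * mul_mat(lora_a, lora_b); the add is applied in place on the base tensor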
static struct ggml_cgraph * build_graph_lora(
struct ggml_context * ctx,
struct ggml_tensor * tensor,
struct ggml_tensor * lora_a,
struct ggml_tensor * lora_b,
float scaling
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, scaling);
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand (gf, res);
return gf;
}
static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
if (lora->ctx == NULL) {
return false;
}
std::string name = ggml_get_name(tensor);
std::string name_a = name + std::string(".loraA");
std::string name_b = name + std::string(".loraB");
struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
if (lora_a == NULL || lora_b == NULL) {
return false;
}
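// standard LoRA convention: the low-rank delta is scaled by (user scale) * alpha / r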
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
struct ggml_init_params params;
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_buffer = NULL;
params.no_alloc = true;
struct ggml_context * ctx = NULL;
struct ggml_gallocr * alloc = NULL;
struct ggml_cgraph * gf = NULL;
ctx = ggml_init(params);
alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
ggml_gallocr_alloc_graph(alloc, gf);
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
static std::vector<uint8_t> data_work;
data_work.resize(cplan.work_size);
cplan.work_data = data_work.data();
ggml_graph_compute(gf, &cplan);
ggml_gallocr_free(alloc);
ggml_free(ctx);
return true;
}
static void export_lora(struct export_lora_params * params) {
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < params->lora.size(); ++i) {
struct lora_data * lora = load_lora(&params->lora[i]);
if (lora != NULL) {
loras.push_back(lora);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
}
// open input file
struct llama_file fin(params->fn_model_base.c_str(), "rb");
if (!fin.fp) {
die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
}
// open base model gguf, read tensors without their data
struct ggml_context * ctx_in;
struct gguf_init_params params_gguf;
params_gguf.no_alloc = true;
params_gguf.ctx = &ctx_in;
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
// create new gguf
struct gguf_context * gguf_out = gguf_init_empty();
// copy meta data from base model: kv and tensors
gguf_set_kv(gguf_out, gguf_in);
int n_tensors = gguf_get_n_tensors(gguf_in);
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
gguf_add_tensor(gguf_out, tensor);
}
// create output file
struct llama_file fout(params->fn_model_out.c_str(), "wb");
if (!fout.fp) {
die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
}
// write gguf meta data
std::vector<uint8_t> meta;
meta.resize(gguf_get_meta_size(gguf_out));
gguf_get_meta_data(gguf_out, meta.data());
fout.write_raw(meta.data(), meta.size());
std::vector<uint8_t> data;
std::vector<uint8_t> padding;
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
// read tensor data
data.resize(ggml_nbytes(tensor));
tensor->data = data.data();
size_t offset = gguf_get_tensor_offset(gguf_in, i);
fin.seek(offset + meta.size(), SEEK_SET);
fin.read_raw(data.data(), data.size());
// apply all loras
for (size_t k = 0; k < loras.size(); ++k) {
apply_lora(tensor, loras[k], params->n_threads);
}
// write tensor data + padding
padding.clear();
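// GGML_PAD rounds the size up to the gguf alignment; the gap is zero-filled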
padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
GGML_ASSERT(fout.tell() == offset + meta.size());
// fout.seek(offset + meta.size(), SEEK_SET);
fout.write_raw(data.data(), data.size());
fout.write_raw(padding.data(), padding.size());
if (i % 2 == 0) {
printf(".");
}
}
printf("\n");
// close gguf
gguf_free(gguf_out);
gguf_free(gguf_in);
// free loras
for (size_t i = 0; i < loras.size(); ++i) {
free_lora(loras[i]);
}
}
int main(int argc, char ** argv) {
gpt_params params;
struct export_lora_params params = get_default_export_lora_params();
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
if (!export_lora_params_parse(argc, argv, &params)) {
return 1;
}
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
export_lora(&params);
return 0;
}
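For reference, a minimal sketch of the adapter header that `load_lora` above expects. This is illustration only, assuming little-endian fields as read by the code:
```python
import struct

def read_ggla_header(buf: bytes):
    # mirrors load_lora above: magic, version, lora_r, lora_alpha,
    # each stored as a little-endian uint32
    magic, version, lora_r, lora_alpha = struct.unpack_from('<4I', buf, 0)
    if version != 1:
        raise ValueError(f"unexpected lora file version {version}")
    return magic, lora_r, lora_alpha
```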

View File

@@ -0,0 +1,5 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -0,0 +1,90 @@
# finetune
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
--lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
--train-data "shakespeare.txt" \
--save-every 10 \
--threads 6 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing
# predict
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama-based models are supported!** The output files will be saved every N iterations (configured with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number, and with 'LATEST' for the latest output (a sketch of the substitution follows the lists below).
So in the above example, after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
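A minimal sketch of the ITERATION substitution described above, assuming it is a plain string replacement (which matches the filenames listed):
```python
pattern = "lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin"
for it in (10, 20, "LATEST"):
    # after N iterations the file is written as N; the newest output is also written as LATEST
    print(pattern.replace("ITERATION", str(it)))
```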
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training state. When the input checkpoint file does not exist, finetuning will begin with a new randomly initialized adapter.
llama.cpp-compatible LORA adapters will be saved with the filename specified by `--lora-out FN`.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter, and 100% of yet another one:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```
The scale numbers don't need to add up to one, and you can also use values greater than 1 to further increase an adapter's influence. Making the values too large, however, will sometimes result in worse output, so experiment to find good values.
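The effective strength of an adapter also depends on the alpha and rank values stored in the adapter file. A hedged sketch of the arithmetic used when merging (see `apply_lora` in the export-lora example earlier in this diff; the alpha/rank values below are made-up examples):
```python
def effective_scaling(user_scale: float, lora_alpha: int, lora_r: int) -> float:
    # the weight delta applied per tensor is effective_scaling * (B @ A)
    return user_scale * lora_alpha / lora_r

print(effective_scaling(0.4, 16, 4))  # 1.6 -- `--lora-scaled ... 0.4` with alpha=16, r=4
```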
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:
```bash
--lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
--rank-att-norm N LORA rank for attention norm tensor (default 1)
--rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1)
--rank-out-norm N LORA rank for output norm tensor (default 1)
--rank-tok-embd N LORA rank for token embeddings tensor (default 4)
--rank-out N LORA rank for output tensor (default 4)
--rank-wq N LORA rank for wq tensor (default 4)
--rank-wk N LORA rank for wk tensor (default 4)
--rank-wv N LORA rank for wv tensor (default 4)
--rank-wo N LORA rank for wo tensor (default 4)
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
```
The LORA rank of 'norm' tensors should always be 1.
To see all available options use `llama-finetune --help`.

View File

@@ -0,0 +1,487 @@
#!/usr/bin/env python3
# finetune checkpoint --> gguf conversion
import argparse
import gguf
import struct
import numpy as np
from pathlib import Path
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.prod(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
if tuple(ne) != tuple(self.ne):
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
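# (0 - offset) & 31 equals (-offset) mod 32, i.e. the padding needed to reach the next 32-byte boundary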
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version != 1:
raise ValueError('Invalid version of optimization context in checkpoint file')
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
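# note: 'N' unpacks a native size_t (8 bytes on 64-bit builds), unlike the explicit little-endian fields above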
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
# due to alignment padding the size might not be exact
# but the difference in size for both types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError(f"Invalid optimizer type '{self.type}'")
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class LoraParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
class ModelParams:
def __init__(self, n_ff = None):
self.n_ff = n_ff
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
if self.n_ff is None:
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
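# e.g. LLaMA-7B: n_embd=4096, n_mult=256 -> ((2*16384)//3 + 255)//256*256 = 11008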
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
else:
return self.n_ff
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None, suffix=".weight"):
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
class Layer:
def __init__(self, params, lora_params, bid):
self.bid = bid
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm_a.load(data, offset)
offset = self.att_norm_b.load(data, offset)
offset = self.wq_a.load(data, offset)
offset = self.wq_b.load(data, offset)
offset = self.wk_a.load(data, offset)
offset = self.wk_b.load(data, offset)
offset = self.wv_a.load(data, offset)
offset = self.wv_b.load(data, offset)
offset = self.wo_a.load(data, offset)
offset = self.wo_b.load(data, offset)
offset = self.ffn_norm_a.load(data, offset)
offset = self.ffn_norm_b.load(data, offset)
offset = self.w1_a.load(data, offset)
offset = self.w1_b.load(data, offset)
offset = self.w2_a.load(data, offset)
offset = self.w2_b.load(data, offset)
offset = self.w3_a.load(data, offset)
offset = self.w3_b.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
class LoraModel:
def __init__(self, n_ff = None):
self.params = ModelParams(n_ff = n_ff)
self.lora_params = LoraParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
offset = self.lora_params.load(data, offset)
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
offset = self.tok_embd_a.load(data, offset)
offset = self.tok_embd_b.load(data, offset)
offset = self.norm_a.load(data, offset)
offset = self.norm_b.load(data, offset)
offset = self.output_a.load(data, offset)
offset = self.output_b.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, self.lora_params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.lora_params.save_gguf(gguf_writer)
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class LoraCheckpoint:
def __init__(self, n_ff = None):
self.model = LoraModel(n_ff = n_ff)
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcl':
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
parser.add_argument('--ff', type = int, help = "Feedforward size; if not provided it is computed from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
return parser.parse_args()
def main():
cfg = handle_args()
print(cfg)
data = np.memmap(cfg.input, mode = 'r')
chk = LoraCheckpoint(n_ff = cfg.ff)
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,34 @@
#!/bin/bash
cd "$(dirname "$0")"
cd ../..
EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
--train-data "$LLAMA_TRAINING_DIR/shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

View File

@@ -16,25 +16,20 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
auto prev_stacks = grammar->stacks;
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
cur_stacks = prev_stacks;
grammar->stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : cur_stacks) {
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return true;
}

View File

@@ -201,6 +201,6 @@ Verification results for test.gguf.manifest - Success
These micro C library dependencies were installed via the [clib c package manager](https://github.com/clibs)
- https://github.com/Cyan4973/xxHash
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c

View File

@@ -1,7 +1,7 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "Cyan4973/xxhash",
"repo": "mofosyne/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",

View File

@@ -347,7 +347,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -384,7 +384,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -421,7 +421,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -460,7 +460,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -490,7 +490,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -520,7 +520,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -552,7 +552,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],

View File

@@ -92,11 +92,6 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

View File

@@ -1,6 +1,6 @@
# llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantum models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
## Usage

View File

@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
exit(1); //GGML_ASSERT(false);
}
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error");
exit(1); //GGML_ASSERT(false);
}
++e.ncall;
if (m_params.verbosity > 1) {
@@ -433,8 +433,8 @@ static void process_logits(
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
@@ -611,10 +611,10 @@ int main(int argc, char ** argv) {
params.warmup = false;
// init
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model;
llama_context * ctx;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;

View File

@@ -179,10 +179,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -203,8 +200,8 @@ int main(int argc, char ** argv) {
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;

View File

@@ -23,18 +23,6 @@
#include "ggml-cuda.h"
#include "ggml-sycl.h"
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
// utils
static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock;
@@ -104,27 +92,6 @@ static std::string get_cpu_info() {
}
fclose(f);
}
#elif defined(_WIN32)
HKEY hKey;
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0,
KEY_READ,
&hKey) != ERROR_SUCCESS) {
// fail to open registry key
return "";
}
char cpu_brand[256];
DWORD cpu_brand_size = sizeof(cpu_brand);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
(LPBYTE)cpu_brand,
&cpu_brand_size) == ERROR_SUCCESS) {
id.assign(cpu_brand, cpu_brand_size);
}
RegCloseKey(hKey);
#endif
// TODO: other platforms
return id;
@@ -153,17 +120,6 @@ static std::string get_gpu_info() {
id += "/";
}
}
#endif
#ifdef GGML_USE_CANN
uint32_t count = ggml_backend_cann_get_device_count();
for (uint32_t i = 0; i < count; i++) {
char buf[128];
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;
@@ -179,7 +135,7 @@ static const char * output_format_str(output_formats format) {
case JSON: return "json";
case MARKDOWN: return "md";
case SQL: return "sql";
default: GGML_ABORT("invalid output format");
default: GGML_ASSERT(!"invalid output format");
}
}
@@ -205,7 +161,7 @@ static const char * split_mode_str(llama_split_mode mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
case LLAMA_SPLIT_MODE_LAYER: return "layer";
case LLAMA_SPLIT_MODE_ROW: return "row";
default: GGML_ABORT("invalid split mode");
default: GGML_ASSERT(!"invalid split mode");
}
}
@@ -1355,7 +1311,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
case SQL:
return std::unique_ptr<printer>(new sql_printer());
}
GGML_ABORT("fatal error");
GGML_ASSERT(false);
}
int main(int argc, char ** argv) {

View File

@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return nullptr;
return env->NewStringUTF("");
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);

View File

@@ -26,12 +26,11 @@ actor LlamaContext {
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 1024
var n_len: Int32 = 64
var n_cur: Int32 = 0
var n_decode: Int32 = 0
@@ -161,7 +160,6 @@ actor LlamaContext {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str

View File

@@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
messageLog += "\(text)"
Task.detached {
while await !llamaContext.is_done {
while await llamaContext.n_cur < llamaContext.n_len {
let result = await llamaContext.completion_loop()
await MainActor.run {
self.messageLog += "\(result)"

View File

@@ -36,10 +36,3 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET llama-minicpmv-cli)
add_executable(${TARGET} minicpmv-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View File

@@ -1,99 +0,0 @@
## MiniCPM-Llama3-V 2.5
### Prepare models and code
Download the [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from Hugging Face to the "MiniCPM-Llama3-V-2_5" folder.
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
### Usage
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files we provide)
```bash
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Android
#### Build on Android device using Termux
We found that building on the Android device gives better runtime performance, so we recommend building on-device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add execute permission:
(This assumes you have pushed the built executable files to /sdcard/llama.cpp/bin using `adb push`.)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View File

@@ -1,107 +0,0 @@
## MiniCPM-V 2.6
### Prepare models and code
Download the [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from Hugging Face to the "MiniCPM-V-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```
### Usage of MiniCPM-V 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that building on the Android device gives better runtime performance, so we recommend building on-device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app for Android devices (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy the built `llama` binaries and the model file to your device storage. Because file permissions on the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add execute permission:
(This assumes you have pushed the built executable files to /sdcard/llama.cpp/bin using `adb push`.)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move them to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View File

@@ -16,14 +16,6 @@
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -78,28 +70,26 @@ static std::string format(const char * fmt, ...) {
// key constants
//
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -133,20 +123,12 @@ static std::string format(const char * fmt, ...) {
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
#define TN_MINICPMV_PROJ "resampler.proj.weight"
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
enum projector_type {
PROJECTOR_TYPE_MLP,
PROJECTOR_TYPE_MLP_NORM,
PROJECTOR_TYPE_LDP,
PROJECTOR_TYPE_LDPV2,
PROJECTOR_TYPE_RESAMPLER,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -154,7 +136,6 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_MLP, "mlp" },
{ PROJECTOR_TYPE_LDP, "ldp" },
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
};
@@ -215,20 +196,17 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
s = std::move(result);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -510,34 +488,12 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_mlp_2_b;
struct ggml_tensor * mm_model_peg_0_w;
struct ggml_tensor * mm_model_peg_0_b;
// MINICPMV projection
struct ggml_tensor * mm_model_pos_embed_k;
struct ggml_tensor * mm_model_query;
struct ggml_tensor * mm_model_proj;
struct ggml_tensor * mm_model_kv_proj;
struct ggml_tensor * mm_model_attn_q_w;
struct ggml_tensor * mm_model_attn_q_b;
struct ggml_tensor * mm_model_attn_k_w;
struct ggml_tensor * mm_model_attn_k_b;
struct ggml_tensor * mm_model_attn_v_w;
struct ggml_tensor * mm_model_attn_v_b;
struct ggml_tensor * mm_model_attn_o_w;
struct ggml_tensor * mm_model_attn_o_b;
struct ggml_tensor * mm_model_ln_q_w;
struct ggml_tensor * mm_model_ln_q_b;
struct ggml_tensor * mm_model_ln_kv_w;
struct ggml_tensor * mm_model_ln_kv_b;
struct ggml_tensor * mm_model_ln_post_w;
struct ggml_tensor * mm_model_ln_post_b;
};
struct clip_ctx {
bool has_text_encoder = false;
bool has_vision_encoder = false;
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
int minicpmv_version = 2;
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -562,11 +518,9 @@ struct clip_ctx {
ggml_backend_t backend = NULL;
ggml_gallocr_t compute_alloc = NULL;
struct clip_image_size * load_image_size;
};
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -575,33 +529,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
if (ctx->has_minicpmv_projector) {
if (load_image_size == nullptr) {
load_image_size = clip_image_size_init();
}
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width;
image_size_height = load_image_size->height;
if (is_inf) {
image_size_width = imgs->data->nx;
image_size_height = imgs->data->ny;
}
}
const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
int n_layer = hparams.n_layer;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
const int batch_size = imgs->size;
if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1);
}
@@ -614,7 +555,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
@@ -627,22 +568,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
struct ggml_tensor * embeddings = inp;
struct ggml_tensor * pos_embed = nullptr;
if (ctx->has_llava_projector) {
// concat class_embeddings and patch_embeddings
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
// concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
ggml_set_input(positions);
@@ -650,19 +589,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings =
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size;
if (ctx->minicpmv_version == 2) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
// pre-layernorm
if (ctx->has_pre_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -672,9 +598,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
// loop over layers
if (ctx->has_minicpmv_projector) {
n_layer += 1;
}
for (int il = 0; il < n_layer - 1; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
@@ -764,7 +687,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
// llava projector
if (ctx->has_llava_projector) {
{
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -785,8 +708,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -941,75 +864,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
embeddings = peg_0;
}
else {
GGML_ABORT("fatal error");
}
}
// minicpmv projector
else if (ctx->has_minicpmv_projector)
{
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
struct ggml_tensor * q = model.mm_model_query;
{ // layernorm
q = ggml_norm(ctx0, q, eps);
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
}
struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
{ // layernorm
v = ggml_norm(ctx0, v, eps);
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
}
struct ggml_tensor * k;
{ // position
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
k = ggml_add(ctx0, v, pos_embed);
}
{ // attention
int hidden_size = 4096;
const int d_head = 128;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
// permute
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
KQ = ggml_soft_max_inplace(ctx0, KQ);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
}
{ // layernorm
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
}
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
}
else {
GGML_ASSERT(false);
}
@@ -1118,7 +972,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
clip_ctx * new_clip = new clip_ctx{};
clip_ctx * new_clip = new clip_ctx;
// update projector type
{
@@ -1147,15 +1001,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
@@ -1175,18 +1020,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
if (idx != -1) {
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
if (idx != -1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder);
GGML_ASSERT(!new_clip->has_text_encoder);
@@ -1197,7 +1031,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
@@ -1439,27 +1272,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
}
else {
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1498,7 +1310,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
clip_image_f32_batch batch;
batch.size = 1;
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
@@ -1507,17 +1319,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return new_clip;
}
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
ctx_clip->load_image_size = load_image_size;
}
struct clip_image_size * clip_image_size_init() {
struct clip_image_size * load_image_size = new struct clip_image_size();
load_image_size->width = 448;
load_image_size->height = 448;
return load_image_size;
}
struct clip_image_u8 * clip_image_u8_init() {
return new clip_image_u8();
}
@@ -1788,186 +1589,9 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
return patches;
}
static int ensure_divide(int length, int patch_size) {
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
}
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width = original_size.first;
int height = original_size.second;
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
float r = static_cast<float>(width) / height;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
}
int best_width = ensure_divide(width, patch_size);
int best_height = ensure_divide(height, patch_size);
return std::make_pair(best_width, best_height);
}
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width, height;
std::tie(width, height) = original_size;
int grid_x, grid_y;
std::tie(grid_x, grid_y) = grid;
int refine_width = ensure_divide(width, grid_x);
int refine_height = ensure_divide(height, grid_y);
int grid_width = refine_width / grid_x;
int grid_height = refine_height / grid_y;
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
int best_grid_width, best_grid_height;
std::tie(best_grid_width, best_grid_height) = best_grid_size;
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
return refine_size;
}
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
if (i == 1 || i > max_slice_nums) {
continue;
}
candidate_split_grids_nums.push_back(i);
}
std::vector<std::pair<int, int>> candidate_grids;
for (int split_grids_nums : candidate_split_grids_nums) {
int m = 1;
while (m <= split_grids_nums) {
if (split_grids_nums % m == 0) {
candidate_grids.emplace_back(m, split_grids_nums / m);
}
++m;
}
}
std::pair<int, int> best_grid{1, 1};
float min_error = std::numeric_limits<float>::infinity();
for (const auto& grid : candidate_grids) {
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
if (error < min_error) {
best_grid = grid;
min_error = error;
}
}
return best_grid;
}
// inspired from LLaVA-UHD:
// -> https://arxiv.org/pdf/2403.11703
// -> https://github.com/thunlp/LLaVA-UHD
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
const std::pair<int, int> original_size={img->nx,img->ny};
const int original_width = img->nx;
const int original_height = img->ny;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images;
LOG_TEE("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>());
if (multiple <= 1) {
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
images[images.size()-1].push_back(source_image);
}
else if (multiple > 1) {
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 * refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
int height = refine_image->ny;
int grid_x = int(width / best_grid.first);
int grid_y = int(height / best_grid.second);
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
images.push_back(std::vector<clip_image_u8 *>());
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
clip_image_u8 * patch = clip_image_u8_init();
patch->nx = grid_x;
patch->ny = grid_y;
patch->buf.resize(3 * patch->nx * patch->ny);
for (int y = patches_i; y < patches_i + grid_y; ++y) {
for (int x = patches_j; x < patches_j + grid_x; ++x) {
const int i = 3 * (y * refine_image->nx + x);
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
patch->buf[j] = refine_image->buf[i];
patch->buf[j+1] = refine_image->buf[i+1];
patch->buf[j+2] = refine_image->buf[i+2];
}
}
images[images.size()-1].push_back(patch);
}
}
}
return images;
}
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
const int max_slice_nums=9;
const int scale_resolution=448;
const int original_width = ctx_clip->load_image_size->width;
const int original_height = ctx_clip->load_image_size->height;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
return best_grid.first;
}
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
if(clip_is_minicpmv(ctx)){
int max_slice_nums = 9;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
res_imgs->size = 0;
for (size_t i = 0; i < imgs.size(); ++i){
res_imgs->size += imgs[i].size();
}
res_imgs->data = new clip_image_f32[res_imgs->size];
int idx = 0;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
res_imgs->data[idx++] = *res;
clip_image_f32_free(res);
}
}
return true;
}
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -2183,104 +1807,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (ctx->minicpmv_version == 2) {
n_patches = 96;
}
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
}
return n_patches;
}
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
assert(embed_dim % 2 == 0);
int H = pos.size();
int W = pos[0].size();
std::vector<float> omega(embed_dim / 2);
for (int i = 0; i < embed_dim / 2; ++i) {
omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
}
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
for (int d = 0; d < embed_dim / 2; ++d) {
float out_value = pos[h][w] * omega[d];
emb[h][w][d] = sin(out_value);
emb[h][w][d + embed_dim / 2] = cos(out_value);
}
}
}
return emb;
}
static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
assert(embed_dim % 2 == 0);
std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
int H = emb_h.size();
int W = emb_h[0].size();
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
for (int d = 0; d < embed_dim / 2; ++d) {
emb[h][w][d] = emb_h[h][w][d];
emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
}
}
}
return emb;
}
static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
int grid_h_size = image_size.first;
int grid_w_size = image_size.second;
std::vector<float> grid_h(grid_h_size);
std::vector<float> grid_w(grid_w_size);
for (int i = 0; i < grid_h_size; ++i) {
grid_h[i] = static_cast<float>(i);
}
for (int i = 0; i < grid_w_size; ++i) {
grid_w[i] = static_cast<float>(i);
}
std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
for (int h = 0; h < grid_h_size; ++h) {
for (int w = 0; w < grid_w_size; ++w) {
grid[h][w] = grid_w[w];
}
}
std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
for (int h = 0; h < grid_h_size; ++h) {
for (int w = 0; w < grid_w_size; ++w) {
grid_2d[0][h][w] = grid_h[h];
grid_2d[1][h][w] = grid_w[w];
}
}
std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
int H = image_size.first;
int W = image_size.second;
std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
}
}
return pos_embed_2d;
}
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -2303,33 +1834,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
if (ctx->has_llava_projector) {
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
}
if (ctx->has_minicpmv_projector) {
GGML_ASSERT(batch_size == 1);
}
// build the inference graph
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
const int image_size = hparams.image_size;
int image_size_width = image_size;
int image_size_height = image_size;
if (ctx->has_minicpmv_projector) {
image_size_width = imgs->data[0].nx;
image_size_height = imgs->data[0].ny;
}
const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
const int pos_w = ctx->load_image_size->width/patch_size;
const int pos_h = ctx->load_image_size->height/patch_size;
{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -2338,9 +1855,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
for (size_t i = 0; i < imgs->size; i++) {
const int nx = imgs->data[i].nx;
const int ny = imgs->data[i].ny;
if (!ctx->has_minicpmv_projector) {
GGML_ASSERT(nx == image_size && ny == image_size);
}
GGML_ASSERT(nx == image_size && ny == image_size);
const int n = nx * ny;
@@ -2357,87 +1872,37 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
free(data);
}
if (ctx->has_minicpmv_projector) {
{
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[70];
int bucket_coords_w[70];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
{
if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
for(int i=0;i<pos_w * pos_h;++i){
for(int j=0;j<embed_dim;++j){
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
}
}
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
free(pos_embed_data);
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
}
else{
{
if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
{
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
}
{
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
{
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + 1;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
if (ggml_backend_is_cpu(ctx->backend)) {
@@ -2607,22 +2072,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
return ctx->vision_model.mm_3_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (ctx->minicpmv_version == 2) {
return 4096;
}
else if (ctx->minicpmv_version == 3) {
return 3584;
}
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
int clip_is_minicpmv(const struct clip_ctx * ctx) {
if (ctx->has_minicpmv_projector) {
return ctx->minicpmv_version;
}
return 0;
}
View File
@@ -18,17 +18,14 @@
# define CLIP_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct clip_image_size {
int width;
int height;
};
struct clip_image_u8_batch {
struct clip_image_u8 * data;
size_t size;
@@ -58,10 +55,6 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
@@ -85,8 +78,6 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
#ifdef __cplusplus
}
#endif
View File
@@ -202,33 +202,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
clip_image_f32 * patch = clip_image_f32_init();
patch->nx = patch_size * num_patches;
patch->ny = patch_size;
patch->buf.resize(3 * patch->nx * patch->ny);
int patch_index = 0;
for (int i = 0; i < height; i += patch_size) {
for (int j = 0; j < width; j += patch_size) {
for (int pi = 0; pi < patch_size; ++pi) {
for (int pj = 0; pj < patch_size; ++pj) {
int input_index = ((i + pi) * width + (j + pj)) * 3;
int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
patch->buf[output_index] = image->buf[input_index];
patch->buf[output_index+1] = image->buf[input_index+1];
patch->buf[output_index+2] = image->buf[input_index+2];
}
}
patch_index++;
}
}
return patch;
}
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@@ -245,51 +218,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
if (clip_is_minicpmv(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
@@ -299,8 +228,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
return false;
}
}
else {
} else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
@@ -369,11 +297,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
}
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
int num_max_patches = 6;
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
return false;
View File
@@ -17,11 +17,12 @@
# define LLAVA_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct llava_image_embed {
float * embed;
int n_image_pos;
@@ -36,8 +37,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
/** free an embedding made with llava_image_embed_make_* */
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** free an embedding made with llava_image_embed_make_* */
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
View File
@@ -1,329 +0,0 @@
#include "ggml.h"
#include "log.h"
#include "common.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include <cstdio>
#include <cstdlib>
#include <vector>
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static struct llama_model * llava_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static struct clip_ctx * clip_init_context(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
llava_image_embed_free(slice_embed);
}
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}
auto model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_TEE("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
return ctx_sampling;
}
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
return tmp;
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("llava", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty())) {
gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
for (auto & image : params.image) {
int n_past = 0;
auto ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
LOG_TEE("<user>%s\n", params.prompt.c_str());
LOG_TEE("<assistant>");
auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = "";
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
if(!have_tmp)continue;
else break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
printf("%s", tmp);
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
llama_sampling_free(ctx_sampling);
}else {
while (true) {
LOG_TEE("<user>");
std::string prompt;
std::getline(std::cin, prompt);
LOG_TEE("<assistant>");
auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);// mistral llava-1.6
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);
}
llama_sampling_free(ctx_sampling);
}
}
printf("\n");
llama_print_timings(ctx_llava->ctx_llama);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
return 0;
}
View File
@@ -1,806 +0,0 @@
# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Siglip model. """
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
import os
import math
import warnings
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
logging,
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
Example:
```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel
>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()
>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "siglip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=16,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/siglip-base-patch16-224",
# See all SigLIP models at https://huggingface.co/models?filter=siglip
]
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
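# Illustrative usage (added; not part of the original file). For a padded
# batch with row lengths 2 and 3, the helper returns the flat indices of the
# real tokens, the cumulative sequence lengths consumed by flash-attention's
# varlen kernels, and the longest sequence in the batch.
def _example_get_unpad_data():  # sketch only; never called
    mask = torch.tensor([[1, 1, 0], [1, 1, 1]])
    indices, cu_seqlens, max_len = _get_unpad_data(mask)
    assert indices.tolist() == [0, 1, 3, 4, 5]
    assert cu_seqlens.tolist() == [0, 2, 5]
    assert max_len == 3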
def _trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2,
)
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
if tensor.dtype in [torch.float16, torch.bfloat16]:
# The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
og_dtype = tensor.dtype
tensor = tensor.to(torch.float32)
tensor.erfinv_()
tensor = tensor.to(og_dtype)
else:
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
if tensor.dtype == torch.float16:
# The `clamp_` op is not (yet?) defined in float16+cpu
tensor = tensor.to(torch.float32)
tensor.clamp_(min=a, max=b)
tensor = tensor.to(torch.float16)
else:
tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \\leq \text{mean} \\leq b`.
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
and the result is subsquently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
"""
with torch.no_grad():
_trunc_normal_(tensor, 0, 1.0, a, b)
tensor.mul_(std).add_(mean)
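# Illustrative check (added): unlike nn.init.trunc_normal_, this 'tf' variant
# truncates at [a, b] *before* scaling, so the final values lie in
# [mean + a * std, mean + b * std] rather than in [a, b].
def _example_trunc_normal_tf():  # sketch only; never called
    t = torch.empty(1000)
    trunc_normal_tf_(t, mean=5.0, std=0.1, a=-2.0, b=2.0)
    assert 4.79 <= t.min().item() and t.max().item() <= 5.21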
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
denom = fan_in
if mode == "fan_in":
denom = fan_in
elif mode == "fan_out":
denom = fan_out
elif mode == "fan_avg":
denom = (fan_in + fan_out) / 2
variance = scale / denom
if distribution == "truncated_normal":
# constant is stddev of standard normal truncated to (-2, 2)
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
elif distribution == "normal":
with torch.no_grad():
tensor.normal_(std=math.sqrt(variance))
elif distribution == "uniform":
bound = math.sqrt(3 * variance)
with torch.no_grad():
tensor.uniform_(-bound, bound)
else:
raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
class SiglipVisionEmbeddings(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
self.num_patches_per_side = self.image_size // self.patch_size
self.num_patches = self.num_patches_per_side**2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
class SiglipAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.embed_dim = config.hidden_size
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.self_attn = SiglipAttention(config)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
class SiglipPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SiglipVisionConfig
base_model_prefix = "siglip"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SiglipVisionEmbeddings):
width = self.config.hidden_size
nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
elif isinstance(module, nn.Embedding):
default_flax_embed_init(module.weight)
elif isinstance(module, SiglipAttention):
nn.init.normal_(module.q_proj.weight)
nn.init.normal_(module.k_proj.weight)
nn.init.normal_(module.v_proj.weight)
nn.init.normal_(module.out_proj.weight)
nn.init.zeros_(module.q_proj.bias)
nn.init.zeros_(module.k_proj.bias)
nn.init.zeros_(module.v_proj.bias)
nn.init.zeros_(module.out_proj.bias)
elif isinstance(module, SiglipMLP):
nn.init.normal_(module.fc1.weight)
nn.init.normal_(module.fc2.weight)
nn.init.normal_(module.fc1.bias, std=1e-6)
nn.init.normal_(module.fc2.bias, std=1e-6)
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SIGLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`SiglipEncoderLayer`].
Args:
config: SiglipConfig
"""
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
class SiglipVisionTransformer(SiglipPreTrainedModel):
config_class = SiglipVisionConfig
main_input_name = "pixel_values"
_supports_flash_attn_2 = True
def __init__(self, config: SiglipVisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(config)
self.encoder = SiglipEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embedding
import argparse
import json
import re
import numpy as np
from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
TEXT = "clip.text"
VISION = "clip.vision"
def add_key_str(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
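# Illustrative (added): the gguf KEY_* constants used below carry an "{arch}"
# placeholder that add_key_str fills in; with arch=VISION ("clip.vision") a
# key template such as "{arch}.embedding_length" becomes a concrete key.
def _example_add_key_str():  # sketch only; never called
    assert add_key_str("{arch}.embedding_length", VISION) == "clip.vision.embedding_length"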
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
if name in (
"logit_scale",
"text_model.embeddings.position_ids",
"vision_model.embeddings.position_ids",
):
return True
if has_minicpmv and name in ["visual_projection.weight"]:
return True
if name.startswith("v") and not has_vision:
return True
if name.startswith("t") and not has_text:
return True
return False
def get_tensor_name(name: str) -> str:
if "projection" in name:
return name
if "mm_projector" in name:
name = name.replace("model.mm_projector", "mm")
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
return name
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = (
list(range(ord("!"), ord("~") + 1))
+ list(range(ord("¡"), ord("¬") + 1))
+ list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
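# Illustrative (added): printable bytes map to themselves, while whitespace and
# control bytes are shifted above U+00FF so every byte has a visible stand-in.
def _example_bytes_to_unicode():  # sketch only; never called
    enc = bytes_to_unicode()
    assert enc[ord("A")] == "A"   # printable ASCII is unchanged
    assert enc[0] == chr(0x100)   # byte 0x00 is remapped to "Ā"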
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("--text-only", action="store_true", required=False,
help="Save a text-only model. It can't be used to encode images")
ap.add_argument("--vision-only", action="store_true", required=False,
help="Save a vision-only model. It can't be used to encode texts")
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
help="The clip model is from openclip (for ViT-SO400M type))")
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
# with proper
args = ap.parse_args()
if args.text_only and args.vision_only:
print("--text-only and --image-only arguments cannot be specified at the same time.")
exit(1)
if args.use_f32:
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
vocab = None
tokens = None
else:
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
vocab = json.load(f)
tokens = [key for key in vocab]
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if args.use_f32:
ftype = 0
# if args.clip_model_is_vision or args.clip_model_is_openclip:
# model = CLIPVisionModel.from_pretrained(dir_model)
# processor = None
# else:
# model = CLIPModel.from_pretrained(dir_model)
# processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version
emb_dim = 4096
if minicpmv_version == 1:
emb_dim = 2304
elif minicpmv_version == 2:
emb_dim = 4096
elif minicpmv_version == 3:
emb_dim = 3584
default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "idefics2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
}
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None
# if model.attn_pool is not None:
# model.attn_pool = torch.nn.Identity()
# model.blocks = model.blocks[:-1]
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False
if args.text_only:
fname_middle = "text-"
has_vision_encoder = False
elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 3
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
else:
fname_middle = ""
output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="clip")
fout.add_bool("clip.has_text_encoder", has_text_encoder)
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
fout.add_file_type(ftype)
if args.text_only:
fout.add_description("text-only CLIP model")
elif args.vision_only and not has_minicpmv_projector:
fout.add_description("vision-only CLIP model")
elif has_minicpmv_projector:
fout.add_description("image encoder for MiniCPM-V")
# add projector type
fout.add_string("clip.projector_type", "resampler")
fout.add_int32("clip.minicpmv_version", minicpmv_version)
else:
fout.add_description("two-tower CLIP model")
if has_vision_encoder:
# vision_model hparams
fout.add_uint32("clip.vision.image_size", 448)
fout.add_uint32("clip.vision.patch_size", 14)
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
fout.add_uint32("clip.vision.projection_dim", 0)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
block_count = 26
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std
fout.add_array("clip.vision.image_mean", image_mean)
fout.add_array("clip.vision.image_std", image_std)
use_gelu = True
fout.add_bool("clip.use_gelu", use_gelu)
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
"""
embed_dim: output dimension for each position
pos: a list of positions to be encoded: size (M,)
out: (M, D)
"""
assert embed_dim % 2 == 0
omega = np.arange(embed_dim // 2, dtype=np.float32)
omega /= embed_dim / 2.
omega = 1. / 10000 ** omega # (D/2,)
pos = pos.reshape(-1) # (M,)
out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
emb_sin = np.sin(out) # (M, D/2)
emb_cos = np.cos(out) # (M, D/2)
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
return emb
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
assert embed_dim % 2 == 0
# use half of dimensions to encode grid_h
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
return emb
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
"""
grid_size: int of the grid height and width
return:
pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
if isinstance(grid_size, int):
grid_h_size, grid_w_size = grid_size, grid_size
else:
grid_h_size, grid_w_size = grid_size[0], grid_size[1]
grid_h = np.arange(grid_h_size, dtype=np.float32)
grid_w = np.arange(grid_w_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token:
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
return pos_embed
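# Illustrative shape check (added): this script later builds a (70*70, emb_dim)
# position table via get_2d_sincos_pos_embed(emb_dim, (70, 70)); here is a
# small instance of the same call.
def _example_pos_embed():  # sketch only; never called
    pe = get_2d_sincos_pos_embed(embed_dim=8, grid_size=(2, 3))
    assert pe.shape == (6, 8)  # grid_h * grid_w rows, embed_dim columns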
def _replace_name_resampler(s, v):
if re.match("resampler.pos_embed", s):
return {
s: v,
re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
}
if re.match("resampler.proj", s):
return {
re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
}
if re.match("resampler.attn.in_proj_.*", s):
return {
re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
}
return {s: v}
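# Illustrative (added): a fused attention in_proj weight of shape (3*d, d) is
# split into separate q/k/v projection tensors of shape (d, d) via chunk(3, dim=0).
def _example_replace_in_proj():  # sketch only; never called
    w = torch.randn(12, 4)  # d = 4
    out = _replace_name_resampler("resampler.attn.in_proj_weight", w)
    assert sorted(out) == ["resampler.attn.k.weight",
                           "resampler.attn.q.weight",
                           "resampler.attn.v.weight"]
    assert all(t.shape == (4, 4) for t in out.values())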
if has_minicpmv_projector:
projector = torch.load(args.minicpmv_projector)
new_state_dict = {}
for k, v in projector.items():
kvs = _replace_name_resampler(k, v)
for nk, nv in kvs.items():
new_state_dict[nk] = nv
projector = new_state_dict
ftype_cur = 0
for name, data in projector.items():
name = get_tensor_name(name)
data = data.squeeze().numpy()
n_dims = len(data.shape)
if ftype == 1:
if name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
else:
if data.dtype != np.float32:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
fout.add_tensor(name, data)
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
print("Projector tensors added\n")
def _replace_name(s, v):
s = "vision_model." + s
if re.match("vision_model.embeddings.position_embedding", s):
v = v.unsqueeze(0)
return {s: v}
return {s: v}
state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
kvs = _replace_name(k, v)
for nk, nv in kvs.items():
new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
# we don't need this
print(f"skipping parameter: {name}")
continue
name = get_tensor_name(name)
data = data.squeeze().numpy()
n_dims = len(data.shape)
# ftype == 0 -> float32, ftype == 1 -> float16
ftype_cur = 0
if n_dims == 4:
print(f"tensor {name} is always saved in f16")
data = data.astype(np.float16)
ftype_cur = 1
elif ftype == 1:
if name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
else:
if data.dtype != np.float32:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
fout.add_tensor(name, data)
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()
print("Done. Output file: " + fname_out)

View File

@@ -1,45 +0,0 @@
import argparse
import os
import torch
from transformers import AutoModel, AutoTokenizer
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()
# find the model part that includes the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()
# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/minicpmv.projector")
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
if len(clip_tensors) > 0:
clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
torch.save(clip, f"{args.model}/minicpmv.clip")
# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
with open(f"{args.model}/added_tokens.json", "w") as f:
f.write("{}\n")
config = model.llm.config
config.auto_map = {
"AutoConfig": "configuration_minicpm.MiniCPMConfig",
"AutoModel": "modeling_minicpm.MiniCPMModel",
"AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
"AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
"AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")
print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")

View File

@@ -2,4 +2,3 @@
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0
torch~=2.2.1
torchvision~=0.17.1

View File

@@ -58,11 +58,11 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
// load the target model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = NULL;
llama_context * ctx = NULL;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the target model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// Tokenize the prompt
std::vector<llama_token> inp;

View File

@@ -22,11 +22,11 @@ int main(int argc, char ** argv){
llama_backend_init();
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = NULL;
llama_context * ctx = NULL;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(model != nullptr);
// tokenize the prompt

View File

@@ -26,11 +26,12 @@ int main(int argc, char ** argv){
llama_backend_init();
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = NULL;
llama_context * ctx = NULL;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
std::vector<llama_token> inp;
@@ -64,7 +65,7 @@ int main(int argc, char ** argv){
}
const int n_input = inp.size();
const int n_ctx = llama_n_ctx(ctx);
const int n_ctx = params.n_ctx;
int n_drafted = 0;
int n_accept = 0;

View File

@@ -34,11 +34,12 @@ int main(int argc, char ** argv){
llama_backend_init();
llama_numa_init(params.numa);
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = NULL;
llama_context * ctx = NULL;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
std::vector<llama_token> inp;

View File

@@ -124,7 +124,6 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
auto formatted = llama_chat_format_single(
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
return formatted;
}
@@ -207,10 +206,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
@@ -267,9 +263,9 @@ int main(int argc, char ** argv) {
}
}
const bool add_bos = llama_add_bos_token(model);
const bool add_bos = llama_should_add_bos_token(model);
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model));
GGML_ASSERT(llama_add_eos_token(model) != 1);
}
LOG("add_bos: %d\n", add_bos);
@@ -293,13 +289,8 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
} else {
LOG_TEE("error: input is empty\n");
return -1;
}
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
// Tokenize negative prompt

View File

@@ -129,11 +129,11 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
// load the target model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = NULL;
llama_context * ctx = NULL;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the target model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
// load the prompts from an external file if there are any
if (params.prompt.empty()) {

View File

@@ -340,8 +340,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@@ -480,8 +480,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@@ -1733,8 +1733,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -2018,11 +2018,11 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
// load the model and apply lora adapter, if any
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model;
llama_context * ctx;
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;

View File

@@ -6,7 +6,7 @@ import re
from copy import copy
from enum import Enum
from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
from docstring_parser import parse
from pydantic import BaseModel, create_model
@@ -53,38 +53,35 @@ class PydanticDataType(Enum):
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
origin_type = get_origin(pydantic_type)
origin_type = pydantic_type if origin_type is None else origin_type
if isclass(origin_type) and issubclass(origin_type, str):
if isclass(pydantic_type) and issubclass(pydantic_type, str):
return PydanticDataType.STRING.value
elif isclass(origin_type) and issubclass(origin_type, bool):
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
return PydanticDataType.BOOLEAN.value
elif isclass(origin_type) and issubclass(origin_type, int):
elif isclass(pydantic_type) and issubclass(pydantic_type, int):
return PydanticDataType.INTEGER.value
elif isclass(origin_type) and issubclass(origin_type, float):
elif isclass(pydantic_type) and issubclass(pydantic_type, float):
return PydanticDataType.FLOAT.value
elif isclass(origin_type) and issubclass(origin_type, Enum):
elif isclass(pydantic_type) and issubclass(pydantic_type, Enum):
return PydanticDataType.ENUM.value
elif isclass(origin_type) and issubclass(origin_type, BaseModel):
return format_model_and_field_name(origin_type.__name__)
elif origin_type is list:
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
return format_model_and_field_name(pydantic_type.__name__)
elif get_origin(pydantic_type) is list:
element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
elif origin_type is set:
elif get_origin(pydantic_type) is set:
element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
elif origin_type is Union:
elif get_origin(pydantic_type) is Union:
union_types = get_args(pydantic_type)
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
return f"union-{'-or-'.join(union_rules)}"
elif origin_type is Optional:
elif get_origin(pydantic_type) is Optional:
element_type = get_args(pydantic_type)[0]
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
elif isclass(origin_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(origin_type.__name__)}"
elif origin_type is dict:
elif isclass(pydantic_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
elif get_origin(pydantic_type) is dict:
key_type, value_type = get_args(pydantic_type)
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
else:
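A quick sanity check of the mapper (illustrative, added here; the simple cases below behave identically under both the old origin_type-based branches and the new pydantic_type-based ones, assuming the module is importable as pydantic_models_to_grammar):

from typing import List, Union
from pydantic_models_to_grammar import map_pydantic_type_to_gbnf

assert map_pydantic_type_to_gbnf(str) == "string"                # plain class
assert map_pydantic_type_to_gbnf(List[int]) == "integer-list"    # generic list
assert map_pydantic_type_to_gbnf(Union[int, str]) == "union-integer-or-string"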
@@ -121,7 +118,7 @@ def get_members_structure(cls, rule_name):
# Modify this comprehension
members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
for name, param_type in get_type_hints(cls).items()
for name, param_type in cls.__annotations__.items()
if name != "self"
]
@@ -300,20 +297,17 @@ def generate_gbnf_rule_for_type(
field_name = format_model_and_field_name(field_name)
gbnf_type = map_pydantic_type_to_gbnf(field_type)
origin_type = get_origin(field_type)
origin_type = field_type if origin_type is None else origin_type
if isclass(origin_type) and issubclass(origin_type, BaseModel):
if isclass(field_type) and issubclass(field_type, BaseModel):
nested_model_name = format_model_and_field_name(field_type.__name__)
nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
rules.extend(nested_model_rules)
gbnf_type, rules = nested_model_name, rules
elif isclass(origin_type) and issubclass(origin_type, Enum):
elif isclass(field_type) and issubclass(field_type, Enum):
enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes
enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
rules.append(enum_rule)
gbnf_type, rules = model_name + "-" + field_name, rules
elif origin_type is list: # Array
elif get_origin(field_type) == list: # Array
element_type = get_args(field_type)[0]
element_rule_name, additional_rules = generate_gbnf_rule_for_type(
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -323,7 +317,7 @@ def generate_gbnf_rule_for_type(
rules.append(array_rule)
gbnf_type, rules = model_name + "-" + field_name, rules
elif origin_type is set: # Array
elif get_origin(field_type) == set or field_type == set: # Array
element_type = get_args(field_type)[0]
element_rule_name, additional_rules = generate_gbnf_rule_for_type(
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -377,7 +371,7 @@ def generate_gbnf_rule_for_type(
gbnf_type = f"{model_name}-{field_name}-optional"
else:
gbnf_type = f"{model_name}-{field_name}-union"
elif isclass(origin_type) and issubclass(origin_type, str):
elif isclass(field_type) and issubclass(field_type, str):
if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
@@ -393,8 +387,8 @@ def generate_gbnf_rule_for_type(
gbnf_type = PydanticDataType.STRING.value
elif (
isclass(origin_type)
and issubclass(origin_type, float)
isclass(field_type)
and issubclass(field_type, float)
and field_info
and hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra is not None
@@ -419,8 +413,8 @@ def generate_gbnf_rule_for_type(
)
elif (
isclass(origin_type)
and issubclass(origin_type, int)
isclass(field_type)
and issubclass(field_type, int)
and field_info
and hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra is not None
@@ -468,7 +462,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
if not issubclass(model, BaseModel):
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__
if hasattr(model, "__annotations__") and model.__annotations__:
model_fields = {name: (typ, ...) for name, typ in get_type_hints(model).items()}
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} # pyright: ignore[reportGeneralTypeIssues]
else:
init_signature = inspect.signature(model.__init__)
parameters = init_signature.parameters
@@ -476,7 +470,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
name != "self"}
else:
# For Pydantic models, use model_fields and check for ellipsis (required fields)
model_fields = get_type_hints(model)
model_fields = model.__annotations__
model_rule_parts = []
nested_rules = []
@@ -712,7 +706,7 @@ def generate_markdown_documentation(
else:
documentation += f" Fields:\n" # noqa: F541
if isclass(model) and issubclass(model, BaseModel):
for name, field_type in get_type_hints(model).items():
for name, field_type in model.__annotations__.items():
# if name == "markdown_code_block":
# continue
if get_origin(field_type) == list:
@@ -760,17 +754,14 @@ def generate_field_markdown(
field_info = model.model_fields.get(field_name)
field_description = field_info.description if field_info and field_info.description else ""
origin_type = get_origin(field_type)
origin_type = field_type if origin_type is None else origin_type
if origin_type == list:
if get_origin(field_type) == list:
element_type = get_args(field_type)[0]
field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
if field_description != "":
field_text += ":\n"
else:
field_text += "\n"
elif origin_type == Union:
elif get_origin(field_type) == Union:
element_types = get_args(field_type)
types = []
for element_type in element_types:
@@ -801,9 +792,9 @@ def generate_field_markdown(
example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
field_text += f"{indent} Example: {example_text}\n"
if isclass(origin_type) and issubclass(origin_type, BaseModel):
if isclass(field_type) and issubclass(field_type, BaseModel):
field_text += f"{indent} Details:\n"
for name, type_ in get_type_hints(field_type).items():
for name, type_ in field_type.__annotations__.items():
field_text += generate_field_markdown(name, type_, field_type, depth + 2)
return field_text
@@ -864,7 +855,7 @@ def generate_text_documentation(
if isclass(model) and issubclass(model, BaseModel):
documentation_fields = ""
for name, field_type in get_type_hints(model).items():
for name, field_type in model.__annotations__.items():
# if name == "markdown_code_block":
# continue
if get_origin(field_type) == list:
@@ -957,7 +948,7 @@ def generate_field_text(
if isclass(field_type) and issubclass(field_type, BaseModel):
field_text += f"{indent} Details:\n"
for name, type_ in get_type_hints(field_type).items():
for name, type_ in field_type.__annotations__.items():
field_text += generate_field_text(name, type_, field_type, depth + 2)
return field_text

343
examples/pydantic_models_to_grammar_examples.py Executable file → Normal file
View File

@@ -1,15 +1,8 @@
#!/usr/bin/env python3
"""Function calling example using pydantic models."""
# Function calling example using pydantic models.
from __future__ import annotations
import argparse
import datetime
import json
import logging
import textwrap
import sys
from enum import Enum
from typing import Optional, Union
@@ -19,54 +12,28 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
def create_completion(host, prompt, gbnf_grammar):
"""Calls the /completion API on llama-server.
See
https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
"""
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
# Function to get completion on the llama.cpp server with grammar.
def create_completion(prompt, grammar):
headers = {"Content-Type": "application/json"}
data = {"prompt": prompt, "grammar": gbnf_grammar}
result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
assert data.get("error") is None, data
logging.info("Result: %s", result)
content = result["content"]
print(f" Model: {result['model']}")
print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
return content
data = {"prompt": prompt, "grammar": grammar}
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
data = response.json()
print(data["content"])
return data["content"]
# A function for the agent to send a message to the user.
class SendMessageToUser(BaseModel):
"""Send a message to the User."""
"""
Send a message to the User.
"""
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
message: str = Field(..., description="Message you want to send to the user.")
def run(self):
print(f"SendMessageToUser: {self.message}")
def example_rce(host):
"""Minimal test case where the LLM call an arbitrary python function."""
print("- example_rce")
tools = [SendMessageToUser]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
tools_map = {tool.__name__:tool for tool in tools}
# This finds "SendMessageToUser":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
tool(**json_data["function_parameters"]).run()
return 0
print(self.message)
# Enum for the calculator tool.
@@ -77,11 +44,11 @@ class MathOperation(Enum):
DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract,
# multiply, and divide. Docstring and description of fields will be used in
# system prompt.
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
class Calculator(BaseModel):
"""Perform a math operation on two numbers."""
"""
Perform a math operation on two numbers.
"""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
@@ -99,61 +66,55 @@ class Calculator(BaseModel):
raise ValueError("Unknown operation.")
def example_calculator(host):
"""Have the LLM ask to get a calculation done.
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
# pydantic_model_list is the list of pydantic models
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
# outer_object_content is the name of outer object content.
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
Here the grammar gets generated by passing the available function models to
generate_gbnf_grammar_and_documentation function. This also generates a
documentation usable by the LLM.
print(gbnf_grammar)
print(documentation)
pydantic_model_list is the list of pydantic models. outer_object_name is an
optional name for an outer object around the actual model object, like a
"function" object with "function_parameters" which contains the actual model
object; if None, no outer object will be generated. outer_object_content is
the name of the outer object content.
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
"""
print("- example_calculator")
tools = [SendMessageToUser, Calculator]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message1 = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
expected = {
"function": "Calculator",
"function_parameters": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
if json_data != expected:
print(" Result is not as expected!")
tools_map = {tool.__name__:tool for tool in tools}
# This finds "Calculator":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
result = tool(**json_data["function_parameters"]).run()
print(f" Call {json_data['function']} gave result {result}")
return 0
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
# This should output something like this:
# {
# "function": "calculator",
# "function_parameters": {
# "number_one": 42,
# "operation": "multiply",
# "number_two": 42
# }
# }
function_dictionary = json.loads(text)
if function_dictionary["function"] == "calculator":
function_parameters = {**function_dictionary["function_parameters"]}
print(Calculator(**function_parameters).run())
# This should output: 1764
# An example of structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
class Category(Enum):
"""The category of the book."""
"""
The category of the book.
"""
Fiction = "Fiction"
NonFiction = "Non-Fiction"
class Book(BaseModel):
"""Represents an entry about a book."""
"""
Represents an entry about a book.
"""
title: str = Field(..., description="Title of the book.")
author: str = Field(..., description="Author of the book.")
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@@ -162,42 +123,33 @@ class Book(BaseModel):
summary: str = Field(..., description="Summary of the book.")
def example_struct(host):
"""A example structured output based on pydantic models.
# We need no additional parameters other than our list of pydantic models.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
The LLM will create an entry for a Book database out of an unstructured
text. We need no additional parameters other than our list of pydantic
models.
"""
print("- example_struct")
tools = [Book]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
# In this case, there's no function nor function_parameters.
# Here the result will vary based on the LLM used.
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
if keys != sorted(json_data.keys()):
print(f"Unexpected result: {sorted(json_data.keys())}")
return 1
book = Book(**json_data)
print(f" As a Book object: %s" % book)
return 0
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(Book(**json_data))
# An example of parallel function calling with a Python function, a pydantic function model and an OpenAI-like function definition.
def get_current_datetime(output_format: Optional[str] = None):
"""Get the current date and time in the given format.
"""
Get the current date and time in the given format.
Args:
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
"""
return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
if output_format is None:
output_format = '%Y-%m-%d %H:%M:%S'
return datetime.datetime.now().strftime(output_format)
# Example function to get the weather.
# Example function to get the weather
def get_current_weather(location, unit):
"""Get the current weather in a given location"""
if "London" in location:
@@ -206,107 +158,68 @@ def get_current_weather(location, unit):
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
elif "North Pole" in location:
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
return json.dumps({"location": location, "temperature": "unknown"})
else:
return json.dumps({"location": location, "temperature": "unknown"})
def example_concurrent(host):
"""An example for parallel function calling with a Python function, a pydantic
function model and an OpenAI like function definition.
"""
print("- example_concurrent")
# Function definition in OpenAI style.
current_weather_tool = {
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
# Here is a function definition in OpenAI style
current_weather_tool = {
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"required": ["location"],
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
}
# Convert OpenAI function definition into pydantic model.
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model.
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
},
}
# Convert normal Python function to a pydantic model.
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
# Convert OpenAI function definition into pydantic model
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
expected = [
{
"function": "get_current_datetime",
"params": {
"output_format": "%Y-%m-%d %H:%M:%S"
}
},
{
"function": "get_current_weather",
"params": {
"location": "London",
"unit": "celsius"
}
},
{
"function": "Calculator",
"params": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
]
res = 0
if json_data != expected:
print(" Result is not as expected!")
print(" This can happen on highly quantized models")
res = 1
tools_map = {tool.__name__: tool for tool in tools}
for call in json_data:
tool = tools_map.get(call["function"])
if not tool:
print(f"Error: unknown tool {call['function']}")
return 1
result = tool(**call["params"]).run()
print(f" Call {call['function']} returned {result}")
# Should output something like this:
# Call get_current_datetime returned 2024-07-15 09:50:38
# Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
# Call Calculator returned 1764
return res
# Convert normal Python function to a pydantic model
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
def main():
parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
ret = 0
# Comment out below to only run the example you want.
ret = ret or example_rce(args.host)
ret = ret or example_calculator(args.host)
ret = ret or example_struct(args.host)
ret = ret or example_concurrent(args.host)
return ret
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tool_list, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
if __name__ == "__main__":
sys.exit(main())
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
json_data = json.loads(text)
print(json_data)
# Should output something like this:
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
for call in json_data:
if call["function"] == "Calculator":
print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
# Should output something like this:
# 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"}
# 1764
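Note that `create_completion`, used throughout these examples, is defined earlier in the script and falls outside this excerpt. A minimal sketch of what such a helper might look like, assuming the `requests` package and a llama.cpp server reachable at `host`:

```python
# Hedged sketch of the create_completion() helper used above; the real
# definition lives earlier in the example script. Assumes a llama.cpp
# server at `host` whose /completion endpoint accepts a GBNF grammar.
import requests


def create_completion(host, prompt, gbnf_grammar):
    response = requests.post(
        f"http://{host}/completion",
        json={"prompt": prompt, "grammar": gbnf_grammar, "n_predict": 1024},
    )
    response.raise_for_status()
    return response.json()["content"]
```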

View File

@@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
}
if (use_reference) {
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
}

View File

@@ -34,7 +34,7 @@ Run the quantized model:
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.

View File

@@ -16,44 +16,44 @@ struct quant_option {
};
static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
{ "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
@@ -91,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
}
// usage:
// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
@@ -104,7 +104,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");

View File

@@ -148,12 +148,11 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
@@ -253,8 +252,6 @@ int main(int argc, char ** argv) {
chunks[i].tokens.clear();
}
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
@@ -262,6 +259,7 @@ int main(int argc, char ** argv) {
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0);
@@ -294,7 +292,6 @@ int main(int argc, char ** argv) {
}
// clean up
llama_batch_free(query_batch);
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

View File

@@ -1,9 +1,5 @@
## Overview
> [!IMPORTANT]
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:

View File

@@ -16,7 +16,7 @@
#include <stdio.h>
struct rpc_server_params {
std::string host = "127.0.0.1";
std::string host = "0.0.0.0";
int port = 50052;
size_t backend_mem = 0;
};
@@ -114,17 +114,6 @@ int main(int argc, char * argv[]) {
fprintf(stderr, "Invalid parameters\n");
return 1;
}
if (params.host != "127.0.0.1") {
fprintf(stderr, "\n");
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
fprintf(stderr, " Never expose the RPC server to an open network!\n");
fprintf(stderr, " This is an experimental feature and is not secure!\n");
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
fprintf(stderr, "\n");
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");

View File

@@ -28,11 +28,10 @@ int main(int argc, char ** argv) {
std::string result2;
// init
llama_init_result llama_init = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
@@ -48,7 +47,7 @@ int main(int argc, char ** argv) {
// save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
const size_t written = llama_state_get_data(ctx, state_mem.data());
FILE *fp_write = fopen("dump_state.bin", "wb");
fwrite(state_mem.data(), 1, written, fp_write);
@@ -100,16 +99,13 @@ int main(int argc, char ** argv) {
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem;
std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
if (read != llama_state_set_data(ctx2, state_mem.data())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
@@ -163,16 +159,13 @@ int main(int argc, char ** argv) {
// load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem;
std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
FILE * fp_read = fopen("dump_state.bin", "rb");
fseek(fp_read, 0, SEEK_END);
state_mem.resize(ftell(fp_read));
fseek(fp_read, 0, SEEK_SET);
const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
fclose(fp_read);
if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
if (read != llama_state_set_data(ctx3, state_mem.data())) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx3);
llama_free_model(model);
@@ -189,7 +182,7 @@ int main(int argc, char ** argv) {
{
// save kv of seq 0
std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
if (ncopy != seq_store.size()) {
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
llama_free(ctx3);
@@ -203,7 +196,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s : kv cache cleared\n", __func__);
// restore kv into seq 1
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
if (nset != seq_store.size()) {
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
llama_free(ctx3);

View File

@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
* LLM inference of F16 and quantum models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* Parallel decoding with multi-user support
* Continuous batching
@@ -15,257 +15,69 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
## Usage
**Command line options:**
```
usage: ./llama-server [options]
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-t N`, `--threads N`: Set the number of threads used by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
- `-tb N, --threads-batch N`: Set the number of threads used by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of `4096`.
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
- `--port`: Set the port to listen. Default: `8080`
- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512`
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output format: `json` or `text`. Default: `json`
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
general:
-h, --help, --usage print usage and exit
--version show version and build info
-v, --verbose print verbose information
--verbosity N set specific verbosity level (default: 0)
--verbose-prompt print a verbose prompt before generation (default: false)
--no-display-prompt don't print prompt at generation (default: false)
-co, --color colorise output to distinguish prompt and user input from generations (default: false)
-s, --seed SEED RNG seed (default: -1, use random seed for < 0)
-t, --threads N number of threads to use during generation (default: 8)
-tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
-td, --threads-draft N number of threads to use during generation (default: same as --threads)
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
--draft N number of tokens to draft for speculative decoding (default: 5)
-ps, --p-split N speculative decoding split probability (default: 0.1)
-lcs, --lookup-cache-static FNAME
path to static lookup cache to use for lookup decoding (not updated by generation)
-lcd, --lookup-cache-dynamic FNAME
path to dynamic lookup cache to use for lookup decoding (updated by generation)
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
-n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
-b, --batch-size N logical maximum batch size (default: 2048)
-ub, --ubatch-size N physical maximum batch size (default: 512)
--keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
--chunks N max number of chunks to process (default: -1, -1 = all)
-fa, --flash-attn enable Flash Attention (default: disabled)
-p, --prompt PROMPT prompt to start generation with
in conversation mode, this will be used as system prompt
(default: '')
-f, --file FNAME a file containing the prompt (default: none)
--in-file FNAME an input file (repeat to specify multiple files)
-bf, --binary-file FNAME binary file containing the prompt (default: none)
-e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
--no-escape do not process escape sequences
-ptc, --print-token-count N print token count every N tokens (default: -1)
--prompt-cache FNAME file to cache prompt state for faster startup (default: none)
--prompt-cache-all if specified, saves user input and generations to cache as well
not supported with --interactive or other interactive options
--prompt-cache-ro if specified, uses the prompt cache but does not update it
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
can be specified more than once for multiple prompts
-sp, --special special tokens output enabled (default: false)
-cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
if suffix/prefix are not specified, default chat template will be used
(default: false)
-i, --interactive run in interactive mode (default: false)
-if, --interactive-first run in interactive mode and wait for input right away (default: false)
-mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
--in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
--in-prefix STRING string to prefix user inputs with (default: empty)
--in-suffix STRING string to suffix after user inputs with (default: empty)
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
sampling:
--samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
--penalize-nl penalize newline tokens (default: false)
--temp N temperature (default: 0.8)
--top-k N top-k sampling (default: 40, 0 = disabled)
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
--mirostat N use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
--cfg-negative-prompt PROMPT
negative prompt to use for guidance (default: '')
--cfg-negative-prompt-file FNAME
negative prompt file to use for guidance
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
grammar:
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
--grammar-file FNAME file to read grammar from
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
embedding:
--pooling {none,mean,cls,last}
pooling type for embeddings, use model default if unspecified
--attention {causal,non-causal}
attention type for embeddings, use model default if unspecified
context hacking:
--rope-scaling {none,linear,yarn}
RoPE frequency scaling method, defaults to linear unless specified by the model
--rope-scale N RoPE context scaling factor, expands context by a factor of N
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
-gan, --grp-attn-n N group-attention factor (default: 1)
-gaw, --grp-attn-w N group-attention width (default: 512.0)
-dkvc, --dump-kv-cache verbose print of the KV cache
-nkvo, --no-kv-offload disable KV offload
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
perplexity:
--all-logits return logits for all tokens in the batch (default: false)
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
--multiple-choice-tasks N
number of tasks to use when computing the multiple choice score (default: 0)
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
--ppl-stride N stride for perplexity calculation (default: 0)
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
parallel:
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
-np, --parallel N number of parallel sequences to decode (default: 1)
-ns, --sequences N number of sequences to decode (default: 1)
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
multi-modality:
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
backend:
--rpc SERVERS comma separated list of RPC servers
--mlock force system to keep model in RAM rather than swapping or compressing
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
--numa TYPE attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
model:
--check-tensors check model tensor data for invalid values (default: false)
--override-kv KEY=TYPE:VALUE
advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
--lora FNAME apply LoRA adapter (implies --no-mmap)
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
--control-vector FNAME add a control vector
note: this argument can be repeated to add multiple control vectors
--control-vector-scaled FNAME SCALE
add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors
--control-vector-layer-range START END
layer range to apply the control vector(s) to, start and end inclusive
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
-mu, --model-url MODEL_URL model download url (default: unused)
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
-hff, --hf-file FILE Hugging Face model file (default: unused)
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
server:
--host HOST ip address to listen (default: 127.0.0.1)
--port PORT port to listen (default: 8080)
--path PATH path to serve static files from (default: )
--embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
--api-key KEY API key to use for authentication (default: none)
--api-key-file FNAME path to file containing API keys (default: none)
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
--timeout N server read/write timeout in seconds (default: 600)
--threads-http N number of threads used to process HTTP requests (default: -1)
--system-prompt-file FNAME
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
--log-format {text,json}
log output format: json or text (default: json)
--metrics enable prometheus compatible metrics endpoint (default: disabled)
--no-slots disables slots monitoring endpoint (default: enabled)
--slot-save-path PATH path to save slot kv cache (default: disabled)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-sps, --slot-prompt-similarity SIMILARITY
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
--lora-init-without-apply
load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
logging:
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
--log-test Run simple logging test
--log-disable Disable trace logs
--log-enable Enable trace logs
--log-file FNAME Specify a log filename (without extension)
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
--log-append Don't truncate the old log file.
```
Available environment variables (if specified, these variables will override parameters specified in arguments):
- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
- `LLAMA_ARG_MODEL`
- `LLAMA_ARG_THREADS`
- `LLAMA_ARG_CTX_SIZE`
- `LLAMA_ARG_N_PARALLEL`
- `LLAMA_ARG_BATCH`
- `LLAMA_ARG_UBATCH`
- `LLAMA_ARG_N_GPU_LAYERS`
- `LLAMA_ARG_THREADS_HTTP`
- `LLAMA_ARG_CHAT_TEMPLATE`
- `LLAMA_ARG_N_PREDICT`
- `LLAMA_ARG_ENDPOINT_METRICS`
- `LLAMA_ARG_ENDPOINT_SLOTS`
- `LLAMA_ARG_EMBEDDINGS`
- `LLAMA_ARG_FLASH_ATTN`
- `LLAMA_ARG_DEFRAG_THOLD`
**If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
- `--ssl-cert-file FNAME`: path to a file containing a PEM-encoded SSL certificate
## Build
@@ -387,18 +199,16 @@ node index.js
## API Endpoints
### GET `/health`: Returns health check result
- **GET** `/health`: Returns the current state of the server:
- 503 -> `{"status": "loading model"}` if the model is still being loaded.
- 500 -> `{"status": "error"}` if the model failed to load.
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
**Response format**
If the query parameter `include_slots` is passed, the `slots` field will contain internal slots data, except if `--slots-endpoint-disable` is set.
- HTTP status code 503
- Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}`
- Explanation: the model is still being loaded.
- HTTP status code 200
- Body: `{"status": "ok" }`
- Explanation: the model is successfully loaded and the server is ready.
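For illustration, a client can poll this endpoint until the model finishes loading. A minimal sketch using only the Python standard library (the address and timeout are assumptions):

```python
# Sketch: wait until the llama.cpp server reports a healthy state.
import json
import time
import urllib.error
import urllib.request


def wait_until_ready(base_url="http://localhost:8080", timeout_s=120):
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{base_url}/health") as resp:
                if json.loads(resp.read()).get("status") == "ok":
                    return True
        except urllib.error.HTTPError:
            pass  # 503 while the model is still loading
        except urllib.error.URLError:
            pass  # server not reachable yet
        time.sleep(1)
    return False
```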
### POST `/completion`: Given a `prompt`, it returns the predicted completion.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
*Options:*
@@ -422,7 +232,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
@@ -476,7 +286,7 @@ node index.js
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
**Response format**
### Result JSON
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
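For example, a streaming client has to parse the server-sent `data:` lines itself; a sketch with the `requests` package (server address assumed):

```python
# Sketch: consume a streamed /completion response chunk by chunk.
import json
import requests


def stream_completion(prompt, base_url="http://localhost:8080"):
    with requests.post(
        f"{base_url}/completion",
        json={"prompt": prompt, "stream": True},
        stream=True,
    ) as resp:
        for line in resp.iter_lines():
            if not line.startswith(b"data: "):
                continue  # skip keep-alive blank lines
            chunk = json.loads(line[len(b"data: "):])
            print(chunk["content"], end="", flush=True)
            if chunk.get("stop"):
                break
```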
@@ -515,7 +325,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
### POST `/tokenize`: Tokenize a given text
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
@@ -523,15 +333,13 @@ Notice that each `probs` is an array of length `n_probs`.
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
### POST `/detokenize`: Convert tokens to text
- **POST** `/detokenize`: Convert tokens to text.
*Options:*
`tokens`: Set the tokens to detokenize.
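A round trip through both endpoints, sketched with the `requests` package (server address assumed):

```python
# Sketch: tokenize a string, then detokenize it back.
import requests

base = "http://localhost:8080"
tokens = requests.post(
    f"{base}/tokenize",
    json={"content": "Hello, world!", "add_special": False},
).json()["tokens"]
text = requests.post(f"{base}/detokenize", json={"tokens": tokens}).json()["content"]
print(tokens, text)
```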
### POST `/embedding`: Generate embedding of a given text
The same as [the embedding example](../embedding) does.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
*Options:*
@@ -539,9 +347,7 @@ The same as [the embedding example](../embedding) does.
`image_data`: An array of objects to hold base64-encoded image `data` and their `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
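A plain-text embedding request could look like the following sketch (assumes the server was started with `--embeddings`; address and response shape as documented above):

```python
# Sketch: fetch an embedding vector for a piece of text.
import requests

emb = requests.post(
    "http://localhost:8080/embedding",
    json={"content": "llamas are great"},
).json()["embedding"]
print(len(emb))  # dimensionality of the model's embedding
```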
### POST `/infill`: For code infilling.
Takes a prefix and a suffix and returns the predicted completion as a stream.
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as a stream.
*Options:*
@@ -553,7 +359,7 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
- **GET** `/props`: Return current server settings.
**Response format**
### Result JSON
```json
{
@@ -571,9 +377,7 @@ Takes a prefix and a suffix and returns the predicted completion as stream.
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
@@ -625,7 +429,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
}'
```
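Besides `curl`, the endpoint can be exercised with the official `openai` Python package, as in this sketch (the `api_key` is a placeholder unless the server was started with `--api-key`, and the `model` value is reported back rather than used for selection):

```python
# Sketch: talk to the llama.cpp server through the OpenAI client.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",  # ignored by llama.cpp; the loaded model is used
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a limerick about llamas."},
    ],
)
print(completion.choices[0].message.content)
```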
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
*Options:*
@@ -659,15 +463,9 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
}'
```
### GET `/slots`: Returns the current slots processing state
- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
This endpoint can be disabled with `--no-slots`
If the query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
**Response format**
Example:
### Result JSON
```json
[
@@ -728,13 +526,7 @@ Example:
]
```
Possible values for `slot[i].state` are:
- `0`: SLOT_STATE_IDLE
- `1`: SLOT_STATE_PROCESSING
### GET `/metrics`: Prometheus compatible metrics exporter
This endpoint is only accessible if `--metrics` is set.
- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
@@ -746,13 +538,13 @@ Available metrics:
- `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred.
### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
*Options:*
`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
**Response format**
### Result JSON
```json
{
@@ -766,13 +558,13 @@ Available metrics:
}
```
### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
*Options:*
`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
**Response format**
### Result JSON
```json
{
@@ -786,9 +578,9 @@ Available metrics:
}
```
### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
**Response format**
### Result JSON
```json
{
@@ -797,46 +589,6 @@ Available metrics:
}
```
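Taken together, the three slot actions allow checkpointing and restoring a conversation. A sketch with the `requests` package (assumes the server was started with `--slot-save-path` and that slot `0` exists):

```python
# Sketch: save, erase, and restore the prompt cache of slot 0.
import requests

base = "http://localhost:8080"
requests.post(f"{base}/slots/0?action=save", json={"filename": "slot0.bin"})
requests.post(f"{base}/slots/0?action=erase")
requests.post(f"{base}/slots/0?action=restore", json={"filename": "slot0.bin"})
```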
### GET `/lora-adapters`: Get list of all LoRA adapters
This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...`
By default, all adapters will be loaded with scale set to 1. To initialize all adapters with scale set to 0, add `--lora-init-without-apply`.
If an adapter is disabled, the scale will be set to 0.
**Response format**
```json
[
{
"id": 0,
"path": "my_adapter_1.gguf",
"scale": 0.0
},
{
"id": 1,
"path": "my_adapter_2.gguf",
"scale": 0.0
}
]
```
### POST `/lora-adapters`: Set list of LoRA adapters
To disable an adapter, either remove it from the list below, or set scale to 0.
**Request format**
To know the `id` of the adapter, use GET `/lora-adapters`
```json
[
{"id": 0, "scale": 0.2},
{"id": 1, "scale": 0.8}
]
```
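Driving both endpoints from Python might look like this sketch (adapter ids and scales are illustrative):

```python
# Sketch: list LoRA adapters, then rescale them at runtime.
import requests

base = "http://localhost:8080"
print(requests.get(f"{base}/lora-adapters").json())
requests.post(f"{base}/lora-adapters", json=[{"id": 0, "scale": 0.5}])
```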
## More examples
### Change system prompt on runtime

Some files were not shown because too many files have changed in this diff.