common : remove defaults

common : add HF arg helpers
2026-04-16 16:27:32 +03:00 · 2024-03-22 15:33:24 +02:00 · 2024-03-22 14:32:36 +02:00
199 changed files with 21160 additions and 50033 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -12,7 +12,6 @@ Checks: >
    -readability-implicit-bool-conversion,
    -readability-magic-numbers,
    -readability-uppercase-literal-suffix,
-    -readability-simplify-boolean-expr,
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -26,10 +26,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV LLAMA_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1

 RUN make

--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -40,11 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-# Enable cURL
-ENV LLAMA_CURL=1
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
+    apt-get install -y build-essential python3 python3-pip git

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -15,9 +15,6 @@ WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
-
-
 RUN make

 ENV LC_ALL=C.utf8
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.

-Name:           llama.cpp-cuda
+Name:           llama.cpp-cublas
 Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j LLAMA_CUDA=1
+make -j LLAMA_CUBLAS=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcuda
-cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacublas.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcuda
-%{_bindir}/llamacppcudaserver
-%{_bindir}/llamacppcudasimple
-/usr/lib/systemd/system/llamacuda.service
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+/usr/lib/systemd/system/llamacublas.service
 %config /etc/sysconfig/llama

 %pre
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV LLAMA_CUDA=1
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1

 RUN make

--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -4,14 +4,13 @@
  config,
  stdenv,
  mkShell,
-  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
  python3,
  mpi,
-  blas,
+  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
  cudaPackages,
  darwin,
  rocmPackages,
@@ -24,7 +23,7 @@
    useOpenCL
    useRocm
    useVulkan
-  ] && blas.meta.available,
+  ],
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
  useMpi ? false, # Increases the runtime closure size by ~700M
@@ -36,8 +35,7 @@
  # It's necessary to consistently use backendStdenv when building with CUDA support,
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic
 }@inputs:

 let
@@ -67,15 +65,10 @@ let
    strings.optionalString (suffices != [ ])
      ", accelerated with ${strings.concatStringsSep ", " suffices}";

-  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
-
  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
-  #
-  # TODO: Package up each Python script or service appropriately, by making
-  # them into "entrypoints"
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
@@ -94,11 +87,6 @@ let
    ]
  );

-  xcrunHost = runCommand "xcrunHost" {} ''
-    mkdir -p $out/bin
-    ln -s /usr/bin/xcrun $out/bin
-  '';
-
  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
  # separately
  darwinBuildInputs =
@@ -162,17 +150,12 @@ effectiveStdenv.mkDerivation (
    postPatch = ''
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-      substituteInPlace ./ggml-metal.m \
-        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-    '';

-    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
-    # `default.metallib` may be compiled with Metal compiler from XCode
-    # and we need to escape sandbox on MacOS to access Metal compiler.
-    # `xcrun` is used find the path of the Metal compiler, which is varible
-    # and not on $PATH
-    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
-    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+      # TODO: Package up each Python script or service appropriately.
+      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
+      # we could make those *.py into setuptools' entrypoints
+      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
+    '';

    nativeBuildInputs =
      [
@@ -190,8 +173,6 @@ effectiveStdenv.mkDerivation (
      ]
      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
        glibc.static
-      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
-        xcrunHost
      ];

    buildInputs =
@@ -200,7 +181,6 @@ effectiveStdenv.mkDerivation (
      ++ optionals useMpi [ mpi ]
      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
-      ++ optionals useBlas [ blas ]
      ++ optionals useVulkan vulkanBuildInputs;

    cmakeFlags =
@@ -211,7 +191,7 @@ effectiveStdenv.mkDerivation (
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUDA" useCuda)
+        (cmakeBool "LLAMA_CUBLAS" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
@@ -236,16 +216,14 @@ effectiveStdenv.mkDerivation (
        # Should likely use `rocmPackages.clr.gpuTargets`.
        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      ]
-      ++ optionals useMetalKit [
-        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
-        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
-      ];
+      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
+      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];

    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
-      mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
-      mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
+      mv $out/bin/main $out/bin/llama
+      mv $out/bin/server $out/bin/llama-server
      mkdir -p $out/include
      cp $src/llama.h $out/include/
    '';
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git

 WORKDIR /app

@@ -20,18 +20,13 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV LLAMA_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
+# Enable cuBLAS
+ENV LLAMA_CUBLAS=1

 RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 COPY --from=build /app/server /server

 ENTRYPOINT [ "/server" ]
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -4,7 +4,7 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git libcurl4-openssl-dev
+    apt-get install -y git

 WORKDIR /app

@@ -16,14 +16,11 @@ RUN mkdir build && \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 COPY --from=build /app/build/bin/server /server

 ENV LC_ALL=C.utf8
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -40,11 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-# Enable cURL
-ENV LLAMA_CURL=1
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 RUN make

 ENTRYPOINT [ "/app/server" ]
--- a/.devops/server-vulkan.Dockerfile
+++ b/.devops/server-vulkan.Dockerfile
@@ -11,16 +11,12 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
    apt update -y && \
    apt-get install -y vulkan-sdk

-# Install cURL
-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 # Build it
 WORKDIR /app
 COPY . .
 RUN mkdir build && \
    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake .. -DLLAMA_VULKAN=1 && \
    cmake --build . --config Release --target server

 # Clean up
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -3,21 +3,16 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git

 WORKDIR /app

 COPY . .

-ENV LLAMA_CURL=1
-
 RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

-RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev
-
 COPY --from=build /app/server /server

 ENV LC_ALL=C.utf8
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -1,300 +0,0 @@
-# Benchmark
-name: Benchmark
-
-on:
-  workflow_dispatch:
-    inputs:
-      gpu-series:
-        description: 'Azure GPU series to run with'
-        required: true
-        type: choice
-        options:
-          - Standard_NC4as_T4_v3
-          - Standard_NC24ads_A100_v4
-          - Standard_NC80adis_H100_v5
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-      duration:
-        description: 'Duration of the bench'
-        type: string
-        default: 10m
-
-  push:
-    branches:
-      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
-  schedule:
-    -  cron: '04 2 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
-  cancel-in-progress: true
-
-jobs:
-  bench-server-baseline:
-    runs-on: Standard_NC4as_T4_v3
-    env:
-      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
-      N_USERS: 8
-      DURATION: 10m
-
-    strategy:
-      matrix:
-        model: [phi-2]
-        ftype: [q4_0, q8_0, f16]
-        include:
-          - model: phi-2
-            ftype: q4_0
-            pr_comment_enabled: "true"
-
-    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install python env
-        id: pipenv
-        run: |
-          cd examples/server/bench
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-
-      - name: Prometheus
-        id: install_prometheus
-        run: |
-          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
-          tar xzf prometheus*.tar.gz --strip-components=1
-          ./prometheus --config.file=examples/server/bench/prometheus.yml &
-          while ! nc -z localhost 9090; do
-            sleep 0.1
-          done
-
-      - name: Set up Go
-        uses: actions/setup-go@v5
-        with:
-          go-version: '1.21'
-
-      - name: Install k6 and xk6-sse
-        id: k6_installation
-        run: |
-          cd examples/server/bench
-          go install go.k6.io/xk6/cmd/xk6@latest
-          xk6 build master \
-              --with github.com/phymbert/xk6-sse
-
-      - name: Build
-        id: cmake_build
-        run: |
-          set -eux
-          mkdir build
-          cd build
-          cmake .. \
-              -DLLAMA_NATIVE=OFF \
-              -DLLAMA_BUILD_SERVER=ON \
-              -DLLAMA_CURL=ON \
-              -DLLAMA_CUBLAS=ON \
-              -DCUDAToolkit_ROOT=/usr/local/cuda \
-              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
-              -DCMAKE_CUDA_ARCHITECTURES=75 \
-              -DLLAMA_FATAL_WARNINGS=OFF \
-              -DLLAMA_ALL_WARNINGS=OFF \
-              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build . --config Release -j $(nproc) --target server
-
-      - name: Download the dataset
-        id: download_dataset
-        run: |
-          cd examples/server/bench
-          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - name: Server bench
-        id: server_bench
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          source venv/bin/activate
-          python bench.py \
-              --runner-label ${{ env.RUNNER_LABEL }} \
-              --name ${{ github.job }} \
-              --branch ${{ github.head_ref || github.ref_name }} \
-              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
-              --scenario script.js \
-              --duration ${{ github.event.inputs.duration || env.DURATION }} \
-              --hf-repo ggml-org/models	 \
-              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
-              --model-path-prefix /models \
-              --parallel ${{ env.N_USERS }} \
-              -ngl 33 \
-              --batch-size 2048 \
-              --ubatch-size	256 \
-              --ctx-size 16384 \
-              --n-prompts 1000 \
-              --max-prompt-tokens 1024 \
-              --max-tokens 2048
-
-          cat results.github.env >> $GITHUB_ENV
-
-          # Remove dataset as we do not want it in the artefact
-          rm ShareGPT_V3_unfiltered_cleaned_split.json
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          compression-level: 9
-          path: |
-            examples/server/bench/*.jpg
-            examples/server/bench/*.json
-            examples/server/bench/*.log
-
-      - name: Commit status
-        uses: Sibz/github-status-action@v1
-        with:
-          authToken: ${{secrets.GITHUB_TOKEN}}
-          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          description: |
-            ${{ env.BENCH_RESULTS }}
-          state: 'success'
-
-      - name: Upload benchmark images
-        uses: devicons/public-upload-to-imgur@v2.2.2
-        continue-on-error: true # Important as it looks unstable: 503
-        id: imgur_step
-        with:
-          client_id: ${{secrets.IMGUR_CLIENT_ID}}
-          path: |
-            examples/server/bench/prompt_tokens_seconds.jpg
-            examples/server/bench/predicted_tokens_seconds.jpg
-            examples/server/bench/kv_cache_usage_ratio.jpg
-            examples/server/bench/requests_processing.jpg
-
-      - name: Extract mermaid
-        id: set_mermaid
-        run: |
-          set -eux
-
-          cd examples/server/bench
-          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
-          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
-          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
-          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
-          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
-          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
-          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
-          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
-      - name: Extract image url
-        id: extract_image_url
-        continue-on-error: true
-        run: |
-          set -eux
-
-          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
-          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
-          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
-          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
-
-      - name: Comment PR
-        uses: mshick/add-pr-comment@v2
-        id: comment_pr
-        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
-        with:
-          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
-          message: |
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
-
-            <details>
-                <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
-            </details>
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -15,10 +15,6 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
@@ -31,7 +27,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -76,10 +72,10 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
-          name: llama-bin-macos-arm64.zip
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip

  macOS-latest-cmake-x64:
    runs-on: macos-latest
@@ -87,7 +83,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -132,21 +128,18 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
-          name: llama-bin-macos-x64.zip
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip

  ubuntu-focal-make:
    runs-on: ubuntu-20.04
-    env:
-      LLAMA_NODE_AVAILABLE: true
-      LLAMA_PYTHON_AVAILABLE: true

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -154,14 +147,6 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential gcc-8

-      - uses: actions/setup-node@v4
-        with:
-          node-version: "20"
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-
      - name: Build
        id: make_build
        env:
@@ -181,7 +166,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -203,7 +188,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -225,17 +210,6 @@ jobs:
          cd build
          ctest -L main --verbose --timeout 900

-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
 #  ubuntu-latest-cmake-sanitizer:
 #    runs-on: ubuntu-latest
 #
@@ -249,7 +223,7 @@ jobs:
 #    steps:
 #      - name: Clone
 #        id: checkout
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        id: depends
@@ -283,7 +257,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -311,7 +285,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -357,7 +331,7 @@ jobs:

      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Build
        id: cmake_build
@@ -398,7 +372,7 @@ jobs:

      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Build
        id: cmake_build
@@ -418,7 +392,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -449,7 +423,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -593,7 +567,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -723,23 +697,23 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
-          name: llama-bin-win-${{ matrix.build }}-x64.zip
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip

-  windows-latest-cmake-cuda:
+  windows-latest-cmake-cublas:
    runs-on: windows-latest

    strategy:
      matrix:
        cuda: ['12.2.0', '11.7.1']
-        build: ['cuda']
+        build: ['cublas']

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -755,7 +729,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
@@ -779,10 +753,10 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
        run: |
@@ -793,14 +767,13 @@ jobs:

      - name: Upload Cuda runtime
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

  windows-latest-cmake-sycl:
    runs-on: windows-latest
-
    defaults:
      run:
        shell: bash
@@ -809,10 +782,11 @@ jobs:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel

+
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -823,38 +797,12 @@ jobs:
        id: cmake_build
        run:  examples/sycl/win-build-sycl.bat

-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
-          name: llama-bin-win-sycl-x64.zip
-
  ios-xcode-build:
    runs-on: macos-latest

    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -864,7 +812,7 @@ jobs:

    steps:
      - name: Clone
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Set up JDK
        uses: actions/setup-java@v3
@@ -887,7 +835,7 @@ jobs:
 #    runs-on: macos-12
 #    steps:
 #    - name: Clone
-#      uses: actions/checkout@v4
+#      uses: actions/checkout@v3
 #
 #    - name: Build
 #      uses: cross-platform-actions/action@v0.19.0
@@ -911,14 +859,14 @@ jobs:
      - macOS-latest-make
      - macOS-latest-cmake
      - windows-latest-cmake
-      - windows-latest-cmake-cuda
+      - windows-latest-cmake-cublas
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

@@ -937,7 +885,7 @@ jobs:

      - name: Download artifacts
        id: download-artifact
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v3

      - name: Create release
        id: create_release
@@ -978,7 +926,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
@@ -1002,7 +950,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
@@ -1026,7 +974,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
@@ -1056,7 +1004,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1072,7 +1020,7 @@ jobs:
 #          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
 #
 #      - name: Upload binaries
-#        uses: actions/upload-artifact@v4
+#        uses: actions/upload-artifact@v1
 #        with:
 #          name: llama-bin-${{ matrix.arch }}
 #          path: build/bin/${{ matrix.build }}
@@ -1095,7 +1043,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1127,7 +1075,7 @@ jobs:
 #
 #      - name: Upload binaries
 #        if: matrix.blas == 'ON'
-#        uses: actions/upload-artifact@v4
+#        uses: actions/upload-artifact@v1
 #        with:
 #          name: llama-blas-bin-${{ matrix.arch }}
 #          path: build/bin/${{ matrix.build }}
@@ -1141,7 +1089,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v4
+#        uses: actions/checkout@v3
 #
 #      - name: Dependencies
 #        run: |
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -19,5 +19,5 @@ jobs:
          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
          days-before-pr-stale: -1
          days-before-pr-close: -1
-          operations-per-run: 10000
+          operations-per-run: 1000
          repo-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -5,16 +5,12 @@ env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        run: |
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,10 +15,6 @@ on:
    branches:
      - master

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
@@ -46,7 +42,7 @@ jobs:
          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -14,14 +14,10 @@ on:
    branches:
      - master

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  editorconfig:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
      - uses: editorconfig-checker/action-editorconfig-checker@main
      - run: editorconfig-checker
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -24,9 +24,9 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v3
    - name: Set up Python
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@v2
      with:
        python-version: '3.9.x'
    - name: Install dependencies
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -17,10 +17,6 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -8,10 +8,6 @@ on:
  pull_request:
    types: [opened, synchronize, reopened]

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  nix-eval:
    strategy:
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -16,19 +16,15 @@ on:
      - 'requirements.txt'
      - 'requirements/*.txt'

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  python-check-requirements:
    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Set up Python environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -2,19 +2,15 @@ name: flake8 Lint

 on: [push, pull_request]

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  flake8-lint:
    runs-on: ubuntu-latest
    name: Lint
    steps:
      - name: Check out source repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Set up Python environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: flake8 Lint
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -4,10 +4,6 @@ name: Server
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
@@ -15,16 +11,12 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
-  pull_request_target:
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
  schedule:
-    -  cron: '2 4 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
+    -  cron: '0 0 * * *'

 jobs:
  server:
@@ -39,6 +31,7 @@ jobs:
        include:
          - build_type: Release
            sanitizer: ""
+            disabled_on_pr: true
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    container:
@@ -48,45 +41,25 @@ jobs:
      options: --cpus 4

    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
      - name: Dependencies
        id: depends
        run: |
          apt-get update
          apt-get -y install \
            build-essential \
-            xxd \
            git \
            cmake \
            python3-pip \
-            curl \
            wget \
            language-pack-en \
            libcurl4-openssl-dev

-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Verify server deps
-        id: verify_server_deps
-        run: |
-          git config --global --add safe.directory $(realpath .)
-          cd examples/server
-          git ls-files --others --modified
-          git status
-          ./deps.sh
-          git status
-          not_ignored_files="$(git ls-files --others --modified)"
-          echo "Modified files: ${not_ignored_files}"
-          if [ -n "${not_ignored_files}" ]; then
-            echo "Repository is dirty or server deps are not built as expected"
-            echo "${not_ignored_files}"
-            exit 1
-          fi
-
      - name: Build
        id: cmake_build
        run: |
@@ -126,7 +99,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
        with:
          fetch-depth: 0

--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -6,10 +6,6 @@ on:
    branches:
      - master

-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
 jobs:
  build:
    strategy:
@@ -18,7 +14,7 @@ jobs:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.runs-on }}
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v3
        with:
          submodules: recursive
          fetch-depth: 0
--- a/.gitignore
+++ b/.gitignore
@@ -50,7 +50,6 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
-/gguf-split
 /gritlm
 /imatrix
 /infill
@@ -59,9 +58,6 @@ models-mnt
 /llava-cli
 /lookahead
 /lookup
-/lookup-create
-/lookup-merge
-/lookup-stats
 /main
 /metal
 /passkey
@@ -77,7 +73,6 @@ models-mnt
 /batched-bench
 /export-lora
 /finetune
-/retrieval
 /speculative
 /parallel
 /train-text-from-scratch
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,8 +89,8 @@ endif()
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
-option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
+option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
+#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -99,7 +99,6 @@ option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
-option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
@@ -113,9 +112,6 @@ option(LLAMA_METAL                           "llama: use Metal"
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
 option(LLAMA_METAL_EMBED_LIBRARY             "llama: embed Metal library"                       OFF)
-set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-                                             "llama: metal minimum macOS version")
-set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@@ -253,16 +249,6 @@ if (LLAMA_METAL)
            set(XC_FLAGS -O3)
        endif()

-        # Append macOS metal versioning flags
-        if (LLAMA_METAL_MACOSX_VERSION_MIN)
-            message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
-            list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
-        endif()
-        if (LLAMA_METAL_STD)
-            message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
-            list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
-        endif()
-
        add_custom_command(
            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
@@ -373,25 +359,18 @@ if (LLAMA_QKK_64)
 endif()

 if (LLAMA_CUBLAS)
-    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
-    set(LLAMA_CUDA ON)
-endif()
-
-if (LLAMA_CUDA)
    cmake_minimum_required(VERSION 3.17)

    find_package(CUDAToolkit)
    if (CUDAToolkit_FOUND)
-        message(STATUS "CUDA found")
+        message(STATUS "cuBLAS found")

        enable_language(CUDA)

        set(GGML_HEADERS_CUDA ggml-cuda.h)
+        set(GGML_SOURCES_CUDA ggml-cuda.cu)

-        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
-
-        add_compile_definitions(GGML_USE_CUDA)
+        add_compile_definitions(GGML_USE_CUBLAS)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -408,9 +387,6 @@ if (LLAMA_CUDA)
        endif()
        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
-        if (LLAMA_CUDA_NO_PEER_COPY)
-            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-        endif()

        if (LLAMA_STATIC)
            if (WIN32)
@@ -440,7 +416,7 @@ if (LLAMA_CUDA)
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

    else()
-        message(WARNING "CUDA not found")
+        message(WARNING "cuBLAS not found")
    endif()
 endif()

@@ -539,11 +515,9 @@ if (LLAMA_HIPBLAS)
    message(STATUS "HIP and hipBLAS found")

    set(GGML_HEADERS_ROCM ggml-cuda.h)
+    set(GGML_SOURCES_ROCM ggml-cuda.cu)

-    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
-    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
-
-    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)

    if (LLAMA_HIP_UMA)
        add_compile_definitions(GGML_HIP_UMA)
@@ -557,15 +531,11 @@ if (LLAMA_HIPBLAS)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()

-    if (LLAMA_CUDA_NO_PEER_COPY)
-        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
-    endif()
-
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

-    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)

    if (LLAMA_STATIC)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
@@ -848,7 +818,7 @@ endif()

 set(CUDA_CXX_FLAGS "")

-if (LLAMA_CUDA)
+if (LLAMA_CUBLAS)
    set(CUDA_FLAGS -use_fast_math)

    if (LLAMA_FATAL_WARNINGS)
@@ -1073,7 +1043,7 @@ endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

-if (LLAMA_CUDA)
+if (LLAMA_CUBLAS)
    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@@ -1183,7 +1153,6 @@ add_library(llama
            llama.h
            unicode.h
            unicode.cpp
-            unicode-data.cpp
            )

 target_include_directories(llama PUBLIC .)
@@ -1279,12 +1248,6 @@ if (LLAMA_METAL)
            GROUP_READ
            WORLD_READ
        DESTINATION ${CMAKE_INSTALL_BINDIR})
-    if (NOT LLAMA_METAL_EMBED_LIBRARY)
-        install(
-            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DESTINATION ${CMAKE_INSTALL_BINDIR}
-        )
-    endif()
 endif()

 #
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search  \
-	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -10,7 +10,7 @@ TEST_TARGETS = \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease                                 \
-	tests/test-json-schema-to-grammar tests/test-grammar-integration
+	tests/test-json-schema-to-grammar

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -390,20 +390,14 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-# LLAMA_CUBLAS is deprecated and will be removed in the future
-	LLAMA_CUDA := 1
-endif
-
-ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
-	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -458,30 +452,19 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-ifdef LLAMA_CUDA_NO_PEER_COPY
-	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
-endif # LLAMA_CUDA_NO_PEER_COPY
+#ifdef LLAMA_CUDA_CUBLAS
+#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
 ifdef JETSON_EOL_MODULE_DETECT
-define NVCC_COMPILE
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endef # NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
-define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
-	$(NVCC_COMPILE)
-
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
-	$(NVCC_COMPILE)
-
-endif # LLAMA_CUDA
+endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST

@@ -527,6 +510,7 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN

 ifdef LLAMA_HIPBLAS
+
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH	?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -538,7 +522,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -551,18 +535,9 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_NO_PEER_COPY
-	HIPFLAGS 	+= -DGGML_CUDA_NO_PEER_COPY
-endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
-	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
-
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
@@ -615,7 +590,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)

 # identify CUDA host compiler
-ifdef LLAMA_CUDA
+ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -640,7 +615,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
 $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
-ifdef LLAMA_CUDA
+ifdef LLAMA_CUBLAS
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -650,16 +625,9 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUDA
+endif # LLAMA_CUBLAS
 $(info )

-ifdef LLAMA_CUBLAS
-$(info !!!!)
-$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
-$(info !!!!)
-$(info )
-endif
-
 #
 # Build library
 #
@@ -679,10 +647,7 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-unicode-data.o: unicode-data.cpp unicode-data.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o

 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -708,9 +673,6 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -718,8 +680,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
-	rm -vrf ggml-cuda/*.o
+	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -837,10 +798,6 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -853,24 +810,14 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
-	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
-	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
@@ -918,10 +865,6 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -32,7 +32,6 @@ let package = Package(
                "ggml.c",
                "llama.cpp",
                "unicode.cpp",
-                "unicode-data.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "ggml-quants.c",
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -3,7 +3,7 @@
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
- [Hardware](#hardware)
+- [Intel GPU](#intel-gpu)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
@@ -14,30 +14,21 @@

 ## Background

-**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

-**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
+oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.

- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
+Intel uses SYCL as a direct programming language to support CPUs, GPUs, and FPGAs.

-### Llama.cpp + SYCL
+To avoid reinventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate the code to SYCL.

-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
+The llama.cpp for SYCL is used to support Intel GPUs.

-When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+For Intel CPUs, we recommend using llama.cpp for x86 (built with Intel MKL).

 ## News

- 2024.4
-  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
-
 - 2024.3
-  - Release binary files of Windows.
  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
@@ -60,11 +51,9 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |Windows|Support|Windows 11|


-## Hardware
+## Intel GPU

-### Intel GPU
-
-**Verified devices**
+### Verified

 |Intel GPU| Status | Verified Model|
 |-|-|-|
@@ -74,231 +63,198 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
 |Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|

-*Notes:*
+Note: If the iGPU has fewer than 80 EUs (Execution Units), the inference speed will be too slow for practical use.

- **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
+### Memory

-  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
+The memory is a limitation to run LLM on GPUs.

- **Execution Unit (EU)**
-  - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
+When running llama.cpp, the log shows how much memory is allocated on the GPU, so you can tell how much memory your case needs. For example: `llm_load_tensors:            buffer size =  3577.56 MiB`.

-### Nvidia GPU
-The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
+For an iGPU, please make sure the memory shared from the host is sufficient. For llama-2-7b.Q4_0, we recommend at least 8GB of host memory.

-**Verified devices**
+For a dGPU, please make sure the device memory is sufficient. For llama-2-7b.Q4_0, we recommend at least 4GB of device memory.

-|Nvidia GPU| Status | Verified Model|
+## Nvidia GPU
+
+### Verified
+
+|Nvidia GPU| Status | Verified Model|
 |-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
+|Ampere Series| Support| A100|

-*Notes:*
-  - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
+### oneMKL for CUDA

-  - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
+The current oneMKL release does not contain the oneMKL cuBlas backend.
+As a result, for Nvidia GPUs, oneMKL must be built from source.

-
-## Docker
-The docker build option is currently limited to *intel GPU* targets.
-### Build image
-```sh
-# Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+```
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+mkdir build
+cd build
+cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
+ninja
+# Add paths as necessary
 ```

-*Notes*:
+## Docker

-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
+Note:
+- Only docker on Linux is tested. Docker on WSL may not work.
+- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)

-You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
+### Build the image
+
+You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.

-### Run container

 ```sh
-# First, find all the DRI cards
+# For F16:
+#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+
+# Or, for F32:
+docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
+
+# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
+```
+
+### Run
+
+```sh
+# Firstly, find all the DRI cards:
 ls -la /dev/dri
-# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
+# Then, pick the card that you want to use.
+
+# For example with "/dev/dri/card1"
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```

-*Notes:*
- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
-
 ## Linux

-### I. Setup Environment
+### Setup Environment

-1. **Install GPU drivers**
+1. Install Intel GPU driver.

-  - **Intel GPU**
+a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

-Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
+Note: for iGPU, please install the client GPU driver.

-*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
-
-Once installed, add the user(s) to the `video` and `render` groups.
+b. Add user to group: video, render.

 ```sh
-sudo usermod -aG render $USER
-sudo usermod -aG video $USER
+sudo usermod -aG render username
+sudo usermod -aG video username
 ```

-*Note*: logout/re-login for the changes to take effect.
+Note: re-login to enable it.

-Verify installation through `clinfo`:
+c. Check

 ```sh
 sudo apt install clinfo
 sudo clinfo -l
 ```

-Sample output:
+Output (example):

-```sh
+```
 Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics

+
 Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```

- **Nvidia GPU**
+2. Install Intel® oneAPI Base toolkit.

-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
-Installation can be verified by running the following:
-```sh
-nvidia-smi
-```
-Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
-```
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
-|-----------------------------------------+----------------------+----------------------+
-| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
-|                                         |                      |               MIG M. |
-|=========================================+======================+======================|
-|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:8D:00.0 Off |                    0 |
-| N/A   36C    P0              57W / 250W |      4MiB / 40960MiB |      0%      Default |
-|                                         |                      |             Disabled |
-+-----------------------------------------+----------------------+----------------------+
-```
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

+Recommend to install to default folder: **/opt/intel/oneapi**.

-2. **Install Intel® oneAPI Base toolkit**
+The following guide uses the default folder as an example. If you use a different folder, please adjust the paths in the guide accordingly.

- **Base installation**
+b. Check

-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
-
-Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
-
-Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
-
-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
-
- **Adding support to Nvidia GPUs**
-
-**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-
-
-**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-mkdir -p buildWithCublas && cd buildWithCublas
-cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-make
-```
-
-
-3. **Verify installation and environment**
-
-In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
 ```sh
 source /opt/intel/oneapi/setvars.sh
+
 sycl-ls
 ```

- **Intel GPU**
-
-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
+There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.

+Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+
 ```

- **Nvidia GPU**
+3. Build locally:

-Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
-```
+Note:
+- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
+- By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.

-### II. Build llama.cpp
-
-#### Intel GPU
 ```sh
-# Export relevant ENV variables
+mkdir -p build
+cd build
 source /opt/intel/oneapi/setvars.sh

-# Build LLAMA with MKL BLAS acceleration for intel GPU
-mkdir -p build && cd build
+# For FP16:
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

-# Option 1: Use FP16 for better performance in long-prompt  inference
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# Or, for FP32:
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-```

-#### Nvidia GPU
-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
-
-# Build LLAMA with Nvidia BLAS acceleration through SYCL
-mkdir -p build && cd build
-
-# Option 1: Use FP16 for better performance in long-prompt  inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# Option 2: Use FP32 by default
+# For Nvidia GPUs
 cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Build example/main only
+#cmake --build . --config Release --target main
+
+# Or, build all binary
+cmake --build . --config Release -v
+
+cd ..
 ```

-### III. Run the inference
+or

-1. Retrieve and prepare model
+```sh
+./examples/sycl/build.sh
+```

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+### Run
+
+1. Put model file to folder **models**
+
+You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.

 2. Enable oneAPI running environment

-```sh
+```
 source /opt/intel/oneapi/setvars.sh
 ```

-3. List devices information
+3. List device ID

-Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
+Run without parameter:

 ```sh
 ./build/bin/ls-sycl-device
+
+# or running the "main" executable and look at the output log:
+
+./build/bin/main
 ```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
+
+Check the ID in startup log, like:
+
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@@ -314,15 +270,15 @@ found 6 SYCL devices:

 |Attribute|Note|
 |-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+|compute capability 1.3|Level-zero running time, recommended |
+|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

-4. Launch inference
+4. Device selection and execution of llama.cpp

 There are two device selection modes:

- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
+- Single device: Use one device assigned by user.
+- Multiple devices: Automatically choose the devices with the same biggest Max compute units.

 |Device selection|Parameter|
 |-|-|
@@ -347,64 +303,74 @@ or run by script:
 ```sh
 ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
-
-Otherwise, you can run the script:
+or run by script:

 ```sh
 ./examples/sycl/run_llama2.sh
 ```

-*Notes:*
+Note:

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
+- By default, mmap is used to read the model file. In some cases, this causes a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid this issue.

-```sh
+
+5. Verify the device ID in output
+
+Verify to see if the selected GPU is shown in the output, like:
+
+```
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
 Or
-```sh
+```
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

+
 ## Windows

-### I. Setup Environment
+### Setup Environment

-1. Install GPU driver
+1. Install Intel GPU driver.

-Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
+Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

-2. Install Visual Studio
+Note: **The driver is mandatory for compute function**.

-If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
+2. Install Visual Studio.

-3. Install Intel® oneAPI Base toolkit
+Please install [Visual Studio](https://visualstudio.microsoft.com/), which is required to enable the oneAPI environment on Windows.

-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+3. Install Intel® oneAPI Base toolkit.

-Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

-Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
+Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
+
+Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.

 b. Enable oneAPI running environment:

- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
+- In Search, input 'oneAPI'.

- On the command prompt, enable the runtime environment with the following:
+Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
+
+- In Run:
+
+In CMD:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-c. Verify installation
+c. Check GPU

-In the oneAPI command line, run the following to print the available SYCL devices:
+In oneAPI command line:

 ```
 sycl-ls
 ```

-There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
+There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.

 Output (example):
 ```
@@ -414,7 +380,7 @@ Output (example):
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
 ```

-4. Install build tools
+4. Install cmake & make

 a. Download & install cmake for Windows: https://cmake.org/download/

@@ -424,53 +390,76 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit

 - Extract `w64devkit` on your pc.

- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
+- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.

-### II. Build llama.cpp
+### Build locally:

-On the oneAPI command line window, step into the llama.cpp main directory and run the following:
+In oneAPI command line window:

 ```
 mkdir -p build
 cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+::  for FP16
+::  faster for long-prompt inference
+::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

-make
+::  for FP32
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+
+
+::  build example/main only
+::  make main
+
+::  build all binary
+make -j
+cd ..
 ```

-Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
-```sh
+or
+
+```
 .\examples\sycl\win-build-sycl.bat
 ```

-*Notes:*
+Note:

- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
+- By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.

-### III. Run the inference
+### Run

-1. Retrieve and prepare model
+1. Put model file to folder **models**

-You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
+You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.

 2. Enable oneAPI running environment

-On the oneAPI command line window, run the following and step into the llama.cpp directory:
+- In Search, input 'oneAPI'.
+
+Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
+
+- In Run:
+
+In CMD:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```

-3. List devices information
+3. List device ID

-Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
+Run without parameter:

 ```
 build\bin\ls-sycl-device.exe
+
+or
+
+build\bin\main.exe
 ```

-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
+Check the ID in startup log, like:
+
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@@ -491,7 +480,7 @@ found 6 SYCL devices:
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|


-4. Launch inference
+4. Device selection and execution of llama.cpp

 There are two device selection modes:

@@ -516,7 +505,7 @@ build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be
 ```
 build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
-Otherwise, run the following wrapper script:
+or run by script:

 ```
 .\examples\sycl\win-run-llama2.bat
@@ -524,14 +513,19 @@ Otherwise, run the following wrapper script:

 Note:

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
+- By default, mmap is used to read the model file. In some cases, this causes a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid this issue.

-```sh
+
+
+5. Verify the device ID in output
+
+Verify to see if the selected GPU is shown in the output, like:
+
+```
 detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
 Or
-```sh
+```
 use 1 SYCL GPUs: [0] with Max compute units:512
 ```

@@ -541,54 +535,64 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 |Name|Value|Function|
 |-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
+|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
+|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
+|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
+
+#### Running

-#### Runtime

 |Name|Value|Function|
 |-|-|-|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
 |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

-## Known Issues
+## Known Issue

- Hanging during startup
+- Hang during startup

-  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
+  llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy behaves abnormally and blocks.

-  - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
+  Solution: add **--no-mmap** or **--mmap 0**.

- `Split-mode:[row]` is not supported.
+- Split-mode: [row] is not supported
+
+  It is under development.

 ## Q&A

+Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible.
+
+
 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

-  - Potential cause: Unavailable oneAPI installation or not set ENV variables.
-  - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
+  The oneAPI running environment has not been enabled.

- General compiler error:
+  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.

-  - Remove build folder or try a clean-build.
+- On Windows, there is no result and no error.

- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
+  The oneAPI running environment has not been enabled.

-  Please double-check with `sudo sycl-ls`.
+- A compile error occurs.

-  If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:
+  Remove folder **build** and try again.
+
+- I can **not** see **[ext_oneapi_level_zero:gpu:0]** after installing the GPU driver on Linux.
+
+  Please run **sudo sycl-ls**.
+
+  If it appears in the output, please add your user to the video and render groups:

  ```
-  sudo usermod -aG render $USER
-  sudo usermod -aG video $USER
+  sudo usermod -aG render username
+  sudo usermod -aG video username
  ```
-  Otherwise, please double-check the GPU driver installation steps.

-### **GitHub contribution**:
-Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
+  Then **relogin**.
+
+  If you do not see it, please check the GPU driver installation steps again.

 ## Todo

--- a/README.md
+++ b/README.md
@@ -10,7 +10,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
@@ -18,10 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
+- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
@@ -116,9 +113,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
 - [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)

 **Multimodal models:**

@@ -142,7 +137,6 @@ Typically finetunes of the base models below are supported as well.
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -152,7 +146,6 @@ Typically finetunes of the base models below are supported as well.
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)

 **UI:**

@@ -179,11 +172,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
 - [Msty](https://msty.app) (proprietary)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
-
-*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 ---

@@ -459,27 +447,30 @@ Building the program with BLAS support may lead to some performance improvements

  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

- #### CUDA
+- #### cuBLAS

-  This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.

  - Using `make`:
    ```bash
-    make LLAMA_CUDA=1
+    make LLAMA_CUBLAS=1
    ```
  - Using `CMake`:

    ```bash
    mkdir build
    cd build
-    cmake .. -DLLAMA_CUDA=ON
+    cmake .. -DLLAMA_CUBLAS=ON
    cmake --build . --config Release
    ```

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

+<!---
+  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
+--->
  | Option                         | Legal values           | Default | Description |
  |--------------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV          | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,67 +0,0 @@
-# Security Policy
-
- - [**Using llama.cpp securely**](#using-llamacpp-securely)
-   - [Untrusted models](#untrusted-models)
-   - [Untrusted inputs](#untrusted-inputs)
-   - [Data privacy](#data-privacy)
-   - [Untrusted environments or networks](#untrusted-environments-or-networks)
-   - [Multi-Tenant environments](#multi-tenant-environments)
- - [**Reporting a vulnerability**](#reporting-a-vulnerability)
-
-## Using llama.cpp securely
-
-### Untrusted models
-Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.
-
-*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.
-
-> [!NOTE]
-> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.
-
-### Untrusted inputs
-
-Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.
-
-For maximum security when handling untrusted inputs, you may need to employ the following:
-
-* Sandboxing: Isolate the environment where the inference happens.
-* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
-* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
-* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
-    * Validation: Enforce strict rules on allowed characters and data types.
-    * Filtering: Remove potentially malicious scripts or code fragments.
-    * Encoding: Convert special characters into safe representations.
-    * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
-
-### Data privacy
-
-To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.
-
-### Untrusted environments or networks
-
-If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions:
-* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value
-* Encrypt your data if sending it over the network.
-
-### Multi-Tenant environments
-
-If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.
-
-1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.
-
-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
-
-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
-
-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
-
-## Reporting a vulnerability
-
-Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.
-
-<!-- normal version -->
-However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
-
-Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
-
-A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
--- a/build.zig
+++ b/build.zig
@@ -116,7 +116,6 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const unicode = make.obj("unicode", "unicode.cpp");
-    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
    const llama = make.obj("llama", "llama.cpp");
    const buildinfo = make.obj("common", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
@@ -128,14 +127,14 @@ pub fn build(b: *std.build.Builder) !void {
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert.py ${path_models}

@@ -575,7 +575,7 @@ function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
-    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -65,8 +65,6 @@ add_library(${TARGET} STATIC
    json.hpp
    train.h
    train.cpp
-    ngram-cache.h
-    ngram-cache.cpp
    )

 if (BUILD_SHARED_LIBS)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -39,21 +39,18 @@
 #endif
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
-#include <curl/easy.h>
-#include <thread>
-#include <future>
 #endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
-#define GGML_USE_CUDA_SYCL
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
 #endif

-#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
-#define GGML_USE_CUDA_SYCL_VULKAN
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
+#define GGML_USE_CUBLAS_SYCL_VULKAN
 #endif

 #if defined(LLAMA_USE_CURL)
@@ -64,7 +61,7 @@
 #else
 #include <sys/syslimits.h>
 #endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
 #define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL

@@ -157,7 +154,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    return result;
 }

-bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
    llama_sampling_params& sparams = params.sparams;

    if (arg == "-s" || arg == "--seed") {
@@ -861,9 +858,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            return true;
        }
        params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUDA_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUBLAS_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
        return true;
    }
    if (arg == "--split-mode" || arg == "-sm") {
@@ -889,9 +886,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            invalid_param = true;
            return true;
        }
-#ifndef GGML_USE_CUDA_SYCL
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUBLAS_SYCL
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL
        return true;
    }
    if (arg == "--tensor-split" || arg == "-ts") {
@@ -917,9 +914,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
                params.tensor_split[i] = 0.0f;
            }
        }
-#ifndef GGML_USE_CUDA_SYCL_VULKAN
-        fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL_VULKAN
+#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUBLAS_SYCL_VULKAN
        return true;
    }
    if (arg == "--no-mmap") {
@@ -966,22 +963,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
-    if (arg == "-lcs" || arg == "--lookup-cache-static") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.lookup_cache_static = argv[i];
-        return true;
-    }
-    if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.lookup_cache_dynamic = argv[i];
-        return true;
-    }
    if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
        if (++i >= argc) {
            invalid_param = true;
@@ -1062,8 +1043,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.ignore_eos = true;
        return true;
    }
-    if (arg == "--penalize-nl") {
-        sparams.penalize_nl = true;
+    if (arg == "--no-penalize-nl") {
+        sparams.penalize_nl = false;
        return true;
    }
    if (arg == "-l" || arg == "--logit-bias") {
@@ -1239,11 +1220,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
    }
-
    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }
-
    if (params.prompt_cache_all &&
            (params.interactive || params.interactive_first ||
             params.instruct)) {
@@ -1251,11 +1230,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    // short-hand to avoid specifying --hf-file -> default it to --model
-    if (!params.hf_repo.empty() && params.hf_file.empty()) {
-        params.hf_file = params.model;
-    }
-
    if (params.escape) {
        process_escapes(params.prompt);
        process_escapes(params.input_prefix);
@@ -1373,7 +1347,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -dt N, --defrag-thold N\n");
    printf("                        KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    printf("  --penalize-nl         penalize newline tokens\n");
+    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --all-logits          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -1455,10 +1429,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        Hugging Face model file (default: unused)\n");
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
-    printf("  -lcs FNAME, --lookup-cache-static FNAME\n");
-    printf("                        path to static lookup cache to use for lookup decoding (not updated by generation)\n");
-    printf("  -lcd FNAME, --lookup-cache-dynamic FNAME\n");
-    printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
@@ -1705,13 +1675,27 @@ void llama_batch_add(

 #ifdef LLAMA_USE_CURL

-static bool llama_download_file(CURL * curl, const char * url, const char * path) {
-    bool force_download = false;
+struct llama_model * llama_load_model_from_url(
+        const char * model_url,
+        const char * path_model,
+        const struct llama_model_params & params) {
+    // Basic validation of the model_url
+    if (!model_url || strlen(model_url) == 0) {
+        fprintf(stderr, "%s: invalid model_url\n", __func__);
+        return NULL;
+    }
+
+    // Initialize a libcurl easy handle
+    auto curl = curl_easy_init();
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }

    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, url);
+    curl_easy_setopt(curl, CURLOPT_URL, model_url);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
-
 #if defined(_WIN32)
    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
    //   operating system. Currently implemented under MS-Windows.
@@ -1720,16 +1704,16 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path

    // Check if the file already exists locally
    struct stat model_file_info;
-    auto file_exists = (stat(path, &model_file_info) == 0);
+    auto file_exists = (stat(path_model, &model_file_info) == 0);

    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[PATH_MAX] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
+    char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+    snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);

    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[PATH_MAX] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+    char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);

    if (file_exists) {
        auto * f_etag = fopen(etag_path, "r");
@@ -1737,7 +1721,7 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
            if (!fgets(etag, sizeof(etag), f_etag)) {
                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
+                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
            }
            fclose(f_etag);
        }
@@ -1747,7 +1731,7 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
+                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
                        last_modified);
            }
            fclose(f_last_modified);
@@ -1765,11 +1749,6 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
            llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;

-            // Convert header field name to lowercase
-            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
-                buffer[i] = tolower(buffer[i]);
-            }
-
            const char * etag_prefix = "etag: ";
            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@@ -1792,7 +1771,7 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        if (res != CURLE_OK) {
            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
-            return false;
+            return NULL;
        }

        long http_code = 0;
@@ -1800,34 +1779,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        if (http_code != 200) {
            // HEAD not supported, we don't know if the file has changed
            // force trigger downloading
-            force_download = true;
+            file_exists = false;
            fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
        }
    }

    // If the ETag or the Last-Modified headers are different: trigger a new download
-    bool should_download = !file_exists
-        || force_download
-        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
-        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
-    if (should_download) {
-        char path_temporary[PATH_MAX] = {0};
-        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
+    if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
+        char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
+        snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
        if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
-            if (remove(path) != 0) {
+            fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
+            if (remove(path_model) != 0) {
                curl_easy_cleanup(curl);
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
-                return false;
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
+                return NULL;
            }
        }

        // Set the output file
-        auto * outfile = fopen(path_temporary, "wb");
+        auto * outfile = fopen(path_model_temporary, "wb");
        if (!outfile) {
            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
-            return false;
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
+            return NULL;
        }

        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
@@ -1841,30 +1816,15 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        //  display download progress
        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);

-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url;  // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url;  // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
        // start the download
-        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-                llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
+        fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+                model_url, path_model, headers.etag, headers.last_modified);
        auto res = curl_easy_perform(curl);
        if (res != CURLE_OK) {
            fclose(outfile);
            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
-            return false;
+            return NULL;
        }

        long http_code = 0;
@@ -1873,7 +1833,7 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
            fclose(outfile);
            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
+            return NULL;
        }

        // Clean up
@@ -1885,7 +1845,7 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
            if (etag_file) {
                fputs(headers.etag, etag_file);
                fclose(etag_file);
-                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
+                fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
            }
        }

@@ -1895,113 +1855,20 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
            if (last_modified_file) {
                fputs(headers.last_modified, last_modified_file);
                fclose(last_modified_file);
-                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
+                fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
                        headers.last_modified);
            }
        }

-        if (rename(path_temporary, path) != 0) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * llama_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
-        fprintf(stderr, "%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    // Initialize libcurl
-    auto * curl = curl_easy_init();
-
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
-    }
-
-    if (!llama_download_file(curl, model_url, path_model)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
-        if (!ctx_gguf) {
-            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
+        if (rename(path_model_temporary, path_model) != 0) {
            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
            return NULL;
        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
    }

    curl_easy_cleanup(curl);

-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-                fprintf(stderr, "\n%s: unexpected model file name: %s"
-                                " n_split=%d\n", __func__, path_model, n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-                fprintf(stderr, "\n%s: unexpected model url: %s"
-                                " n_split=%d\n", __func__, model_url, n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                auto * curl = curl_easy_init();
-                bool res = llama_download_file(curl, split_url, split_path);
-                curl_easy_cleanup(curl);
-
-                return res;
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
    return llama_load_model_from_file(path_model, params);
 }

@@ -2382,7 +2249,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n",        ggml_cpu_has_cuda()        ? "true" : "false");
+    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
    fprintf(stream, "cpu_has_vulkan: %s\n",      ggml_cpu_has_vulkan()      ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
    fprintf(stream, "cpu_has_kompute: %s\n",     ggml_cpu_has_kompute()     ? "true" : "false");
@@ -2484,7 +2351,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
--- a/common/common.h
+++ b/common/common.h
@@ -88,22 +88,20 @@ struct gpt_params {
    // // sampling parameters
    struct llama_sampling_params sparams;

-    std::string model                = "models/7B/ggml-model-f16.gguf"; // model path
-    std::string model_draft          = "";  // draft model for speculative decoding
-    std::string model_alias          = "unknown"; // model alias
-    std::string model_url            = "";  // model url to download
-    std::string hf_repo              = "";  // HF repo
-    std::string hf_file              = "";  // HF file
-    std::string prompt               = "";
-    std::string prompt_file          = "";  // store the external prompt file name
-    std::string path_prompt_cache    = "";  // path to file for saving/loading prompt eval state
-    std::string input_prefix         = "";  // string to prefix user inputs with
-    std::string input_suffix         = "";  // string to suffix user inputs with
+    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_draft       = "";  // draft model for speculative decoding
+    std::string model_alias       = "unknown"; // model alias
+    std::string model_url         = "";  // model url to download
+    std::string hf_repo           = "";  // HF repo
+    std::string hf_file           = "";  // HF file
+    std::string prompt            = "";
+    std::string prompt_file       = "";  // store the external prompt file name
+    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
+    std::string input_prefix      = "";  // string to prefix user inputs with
+    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir               = "";  // directory in which to save YAML log files
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file          = "";  // file for saving *all* logits
+    std::string logdir            = "";  // directory in which to save YAML log files
+    std::string logits_file       = "";  // file for saving *all* logits

    std::vector<llama_model_kv_override> kv_overrides;

@@ -171,8 +169,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

-bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-
 std::string get_system_info(const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);
@@ -308,10 +304,3 @@ struct llama_control_vector_load_info {
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
 llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
-
-//
-// Split utils
-//
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -9,7 +9,7 @@
 #include <unordered_set>
 #include <vector>

-using json = nlohmann::ordered_json;
+using json = nlohmann::json;

 const std::string SPACE_RULE = "\" \"?";

@@ -124,7 +124,7 @@ static std::string replacePattern(const std::string & input, const std::regex &
 }

 static std::string format_literal(const std::string & literal) {
-    std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
+    std::string escaped = replacePattern(json(literal).dump(), GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
        char c = match.str()[0];
        return GRAMMAR_LITERAL_ESCAPES.at(c);
    });
@@ -137,7 +137,7 @@ private:
    std::function<json(const std::string &)> _fetch_json;
    bool _dotall;
    std::map<std::string, std::string> _rules;
-    std::unordered_map<std::string, json> _refs;
+    std::unordered_map<std::string, nlohmann::json> _refs;
    std::unordered_set<std::string> _refs_being_resolved;
    std::vector<std::string> _errors;
    std::vector<std::string> _warnings;
@@ -413,7 +413,7 @@ private:
            std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
            prop_kv_rule_names[prop_name] = _add_rule(
                name + (name.empty() ? "" : "-") + prop_name + "-kv",
-                format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
+                format_literal(prop_name) + " space \":\" space " + prop_rule_name
            );
            if (required.find(prop_name) != required.end()) {
                required_props.push_back(prop_name);
@@ -495,7 +495,7 @@ public:
        _rules["space"] = SPACE_RULE;
    }

-    void resolve_refs(json & schema, const std::string & url) {
+    void resolve_refs(nlohmann::json & schema, const std::string & url) {
        /*
        * Resolves all $ref fields in the given schema, fetching any remote schemas,
        * replacing each $ref with absolute reference URL and populates _refs with the
@@ -557,7 +557,11 @@ public:
    }

    std::string _generate_constant_rule(const json & value) {
-        return format_literal(value.dump());
+        if (!value.is_string()) {
+            _errors.push_back("Only std::string constants are supported, got " + value.dump());
+            return "";
+        }
+        return format_literal(value.get<std::string>());
    }

    std::string visit(const json & schema, const std::string & name) {
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -1,4 +1,4 @@
 #pragma once
 #include "json.hpp"

-std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
+std::string json_schema_to_grammar(const nlohmann::json& schema);
--- a/common/log.h
+++ b/common/log.h
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#ifndef _MSC_VER
    #define LOG_IMPL(str, ...)                                                                                      \
    do {                                                                                                            \
        if (LOG_TARGET != nullptr)                                                                                  \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG_TEE() INSTEAD
 //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#ifndef _MSC_VER
    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
    do {                                                                                                                                \
        if (LOG_TARGET != nullptr)                                                                                                      \
@@ -566,7 +566,6 @@ inline void log_print_usage()
    printf("  --log-new             Create a separate new log file on start. "
                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
    printf("  --log-append          Don't truncate the old log file.\n");
-    printf("\n");
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -1,282 +0,0 @@
-#include "ngram-cache.h"
-#include "common.h"
-#include "log.h"
-
-#include <cstdint>
-#include <fstream>
-
-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
-                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
-    const int64_t t_start_ms = ggml_time_ms();
-    const int64_t inp_size = inp.size();
-
-    const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
-    int64_t n_done = 0;
-
-    for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
-        const int64_t i_start = std::max(inp_size - nnew, ngram_size);
-        for (int64_t i = i_start; i < inp_size; ++i) {
-            const int64_t ngram_start = i - ngram_size;
-            llama_ngram ngram(&inp[ngram_start], ngram_size);
-            const llama_token token = inp[i];
-
-            llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
-            if (part_it == ngram_cache.end()) {
-                llama_ngram_cache_part part;
-                part.emplace(token, 1);
-                ngram_cache.emplace(ngram, part);
-            } else {
-                llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
-                if (token_count_it == part_it->second.end()) {
-                    part_it->second.emplace(token, 1);
-                } else {
-                    token_count_it->second++;
-                }
-            }
-            ++n_done;
-
-            if (print_progress && n_done % 10000000 == 0) {
-                const int64_t t_now_ms = ggml_time_ms();
-                const int64_t eta_ms   = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
-                const int64_t eta_min  = eta_ms / (60*1000);
-                const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;
-
-                fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
-            }
-        }
-    }
-}
-
-// Helper function to get a token from the combined, speculative sequence of inp and draft.
-static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
-    return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
-}
-
-// If sample size or percentage are below these thresholds the draft is aborted early:
-constexpr int    draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2,  2,  1,  1};
-constexpr int        draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
-constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
-constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
-
-// Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-    if (part_static_it == nc_static.end()) {
-        return -1;
-    }
-    const llama_ngram_cache_part part_static = part_static_it->second;
-
-    int max_count_static  = 0;
-    int sum_count_static  = 0;
-    llama_token max_token = -1;
-
-    for (std::pair<llama_token, int> token_count_static : part_static) {
-        const llama_token token = token_count_static.first;
-        const int32_t count_static  = token_count_static.second;
-
-        if (count_static > max_count_static) {
-            max_token        = token;
-            max_count_static = count_static;
-        }
-        sum_count_static += count_static;
-    }
-
-    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return -1;
-    }
-    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return -1;
-    }
-    return max_token;
-}
-
-// Try to draft a token from primary cache (context/dynamic), validate with static cache:
-static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
-    const int * min_sample_size, const int * min_percent) {
-
-    llama_token drafted_token = -1;
-
-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
-
-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
-        if (part_primary_it == nc_primary.end()) {
-            continue;
-        }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
-
-        int max_count_primary = 0;
-        int max_count_static  = 0;
-        int sum_count_primary = 0;
-        llama_token max_token = -1;
-
-        for (std::pair<llama_token, int> token_count_primary : part_primary) {
-            const llama_token token = token_count_primary.first;
-
-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
-
-            const int32_t count_primary = token_count_primary.second;
-            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
-
-            if (count_primary*count_static > max_count_primary*max_count_static) {
-                max_token         = token;
-                max_count_primary = count_primary;
-                max_count_static  = count_static;
-            }
-            sum_count_primary += count_primary;
-        }
-
-        if (sum_count_primary < min_sample_size[i]) {
-            continue;
-        }
-        if (100*max_count_primary < min_percent[i]*sum_count_primary) {
-            continue;;
-        }
-        drafted_token = max_token;
-    }
-
-    return drafted_token;
-}
-
-void llama_ngram_cache_draft(
-    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
-) {
-    GGML_ASSERT(draft.size() == 1);
-    const int inp_size = inp.size();
-
-    if (inp_size < LLAMA_NGRAM_STATIC) {
-        return;
-    }
-
-    while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = -1;
-
-        const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static;
-        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
-            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
-        }
-        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-        llama_ngram_cache_part part_static;
-        if (part_static_it != nc_static.end()) {
-            part_static = part_static_it->second;
-        }
-
-        // cd = context + dynamic
-        std::vector<llama_ngram> ngrams_cd;
-        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
-            const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd;
-            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
-                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
-            }
-            ngrams_cd.push_back(ngram_cd);
-        }
-        if (drafted_token == -1) {
-            drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
-        }
-        if (drafted_token == -1) {
-            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
-        }
-        if (drafted_token == -1) {
-            drafted_token = try_draft(nc_static, ngram_static);
-        }
-
-        if (drafted_token == -1) {
-            break;
-        }
-
-        LOG(" - draft candidate: token=%d\n", drafted_token);
-        draft.push_back(drafted_token);
-    }
-}
-
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
-    std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram      ngram        = item.first;
-        llama_ngram_cache_part token_counts = item.second;
-        GGML_ASSERT(!token_counts.empty());
-        const int32_t ntokens = token_counts.size();
-        GGML_ASSERT(ntokens > 0);
-
-        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
-        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
-        for (std::pair<llama_token, int32_t> item2 : token_counts) {
-            const llama_token token = item2.first;
-            const int32_t     count = item2.second;
-            GGML_ASSERT(count > 0);
-
-            file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
-            file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
-        }
-    }
-
-}
-
-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
-    std::ifstream hashmap_file(filename, std::ios::binary);
-    if (!hashmap_file) {
-        throw std::ifstream::failure("Unable to open file " + filename);
-    }
-    llama_ngram_cache ngram_cache;
-
-    llama_ngram ngram;
-    int32_t     ntokens;
-    llama_token token;
-    int32_t     count;
-
-    char * ngramc   = reinterpret_cast<char*>(&ngram);
-    char * ntokensc = reinterpret_cast<char*>(&ntokens);
-    char * tokenc   = reinterpret_cast<char*>(&token);
-    char * countc   = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
-        GGML_ASSERT(!hashmap_file.eof());
-        GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
-        GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
-
-        for (int i = 0; i < ntokens; ++i) {
-            GGML_ASSERT(!hashmap_file.eof());
-            GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
-            GGML_ASSERT(!hashmap_file.eof());
-            GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
-            GGML_ASSERT(count > 0);
-            token_counts.emplace(token, count);
-        }
-
-        ngram_cache.emplace(ngram, token_counts);
-    }
-    GGML_ASSERT(hashmap_file.eof());
-
-    return ngram_cache;
-}
-
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram      ngram = ngram_part.first;
-        llama_ngram_cache_part  part = ngram_part.second;
-
-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
-        if (part_merged_it == ngram_cache_target.end()) {
-            ngram_cache_target.emplace(ngram, part);
-            continue;
-        }
-
-        for (std::pair<llama_token, int32_t> token_count : part) {
-            const llama_token token = token_count.first;
-            const int32_t     count = token_count.second;
-            GGML_ASSERT(count > 0);
-
-            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
-            if (token_count_merged_it == part_merged_it->second.end()) {
-                part_merged_it->second.emplace(token, count);
-                continue;
-            }
-
-            token_count_merged_it->second += count;
-        }
-    }
-}
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -1,94 +0,0 @@
-#pragma once
-
-#include "llama.h"
-
-#include <unordered_map>
-#include <string>
-#include <vector>
-
-#define LLAMA_NGRAM_MIN    1
-#define LLAMA_NGRAM_MAX    4
-#define LLAMA_NGRAM_STATIC 2
-
-// Data structures to map n-grams to empirical token probabilities:
-
-struct llama_ngram {
-    llama_token tokens[LLAMA_NGRAM_MAX];
-
-    llama_ngram() {
-        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = -1;
-        }
-    }
-
-    llama_ngram(const llama_token * input, const int ngram_size) {
-        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : -1;
-        }
-    }
-
-    bool operator==(const llama_ngram & other) const {
-        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            if (tokens[i] != other.tokens[i]) {
-                return false;
-            }
-        }
-        return true;
-    }
-};
-
-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = 0;
-        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
-        }
-        return hash;
-    }
-};
-
-// token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
-
-// n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
-
-
-// Update an ngram cache with tokens.
-// ngram_cache:         the cache to modify.
-// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
-// inp_data:            the token sequence with which to update ngram_cache.
-// nnew:                how many new tokens have been appended to inp_data since the last call to this function.
-// print_progress:      whether to print progress to stderr.
-//
-// In order to get correct results inp_data can ONLY BE APPENDED TO.
-// Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
-
-// Try to draft tokens from ngram caches.
-// inp:                the tokens generated so far.
-// draft:              the token sequence to draft. Expected to initially contain the previously sampled token.
-// n_draft:            maximum number of tokens to add to draft.
-// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
-// nc_context:         ngram cache based on current context.
-// nc_dynamic:         ngram cache based on previous user generations.
-// nc_static:          ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
-    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
-
-// Save an ngram cache to a file.
-// ngram_cache: the ngram cache to save.
-// filename:    the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
-
-// Load an ngram cache saved with llama_ngram_cache_save.
-// filename: the path from which to load the ngram cache.
-// returns:  an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
-
-// Merge two ngram caches.
-// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
-// ngram_cache_add:    the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -168,20 +168,77 @@ static llama_token llama_sampling_sample_impl(
                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
    const llama_sampling_params & params = ctx_sampling->params;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
    const float   temp            = params.temp;
+    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
+    const float   penalty_repeat  = params.penalty_repeat;
+    const float   penalty_freq    = params.penalty_freq;
+    const float   penalty_present = params.penalty_present;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
+    const bool    penalize_nl     = params.penalize_nl;
+
+    auto & prev = ctx_sampling->prev;
+    auto & cur  = ctx_sampling->cur;

-    std::vector<float> original_logits;
-    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
-    if (!is_resampling) {
-        GGML_ASSERT(!original_logits.empty());
-    }
    llama_token id = 0;
+
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

+    // Declare original_logits at the beginning of the function scope
+    std::vector<float> original_logits;
+
+    if (!is_resampling) {
+        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
+        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
+    }
+
+    // apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    if (ctx_cfg) {
+        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
+        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
+    }
+
+    cur.clear();
+
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
+
+    // apply penalties
+    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
+    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+    if (penalty_tokens_used_size) {
+        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+
+        llama_sample_repetition_penalties(ctx_main, &cur_p,
+                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    // If we are in the resampling phase, apply grammar checks before sampling logic
+    if (is_resampling && ctx_sampling->grammar != NULL) {
+        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+    }
+
    if (temp < 0.0) {
        // greedy sampling, with probs
        llama_sample_softmax(ctx_main, &cur_p);
@@ -245,13 +302,11 @@ static llama_token llama_sampling_sample_impl(
    return id;
 }

-static llama_token_data_array llama_sampling_prepare_impl(
+static llama_token_data_array llama_sample_probability_distribution_impl(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool apply_grammar,
-                  std::vector<float> * original_logits) {
+                  const int idx) {
    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@@ -260,7 +315,6 @@ static llama_token_data_array llama_sampling_prepare_impl(
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
    const float   penalty_present = params.penalty_present;
-
    const bool    penalize_nl     = params.penalize_nl;

    auto & prev = ctx_sampling->prev;
@@ -269,10 +323,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    if (apply_grammar && original_logits != NULL) {
-        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
-    }
+    // Declare original_logits at the beginning of the function scope
+    std::vector<float> original_logits;

    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -312,11 +364,12 @@ static llama_token_data_array llama_sampling_prepare_impl(
        }
    }

-    // apply grammar checks before sampling logic
-    if (apply_grammar && ctx_sampling->grammar != NULL) {
+    // apply grammar checks
+    if (ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

+    llama_sample_softmax(ctx_main, &cur_p);
    return cur_p;
 }

@@ -329,14 +382,12 @@ llama_token llama_sampling_sample(
    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
 }

-llama_token_data_array llama_sampling_prepare(
+llama_token_data_array llama_sampling_probability_distribution(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool apply_grammar,
-                  std::vector<float> * original_logits) {
-    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
+                  const int idx) {
+    return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
 }

 void llama_sampling_accept(
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -131,14 +131,12 @@ llama_token llama_sampling_sample(
        struct llama_context * ctx_cfg,
        int idx = 0);

-// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
-llama_token_data_array llama_sampling_prepare(
+// returns the probability that token of given id will be sampled
+llama_token_data_array llama_sampling_probability_distribution(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
-        int idx = 0,
-        bool apply_grammar = true,
-        std::vector<float> * original_logits = nullptr);
+        int idx = 0);

 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -23,7 +23,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import LlamaHfVocab, permute
+from convert import HfVocab


 ###### MODEL DEFINITIONS ######
@@ -93,42 +93,31 @@ class Model(ABC):

        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
-            print(f"gguf: context length = {n_ctx}")

        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        self.gguf_writer.add_embedding_length(n_embd)
-        print(f"gguf: embedding length = {n_embd}")

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
-            print(f"gguf: feed forward length = {n_ff}")

        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_head_count(n_head)
-        print(f"gguf: head count = {n_head}")

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
-            print(f"gguf: key-value head count = {n_head_kv}")

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
-            print(f"gguf: rope theta = {rope_theta}")
        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
-            print(f"gguf: rms norm epsilon = {f_rms_eps}")
        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
-            print(f"gguf: layer norm epsilon = {f_norm_eps}")
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
-            print(f"gguf: expert count = {n_experts}")
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
-            print(f"gguf: experts used count = {n_experts_used}")

        self.gguf_writer.add_file_type(self.ftype)
-        print(f"gguf: file type = {self.ftype}")

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -230,7 +219,7 @@ class Model(ABC):
    def _set_vocab_gpt2(self):
        dir_model = self.dir_model
        hparams = self.hparams
-        tokens: list[str] = []
+        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
@@ -243,7 +232,8 @@ class Model(ABC):

        for i in range(vocab_size):
            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
+                pad_token = f"[PAD{i}]".encode('utf-8')
+                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
@@ -265,7 +255,7 @@ class Model(ABC):
    def _set_vocab_qwen(self):
        dir_model = self.dir_model
        hparams = self.hparams
-        tokens: list[str] = []
+        tokens: list[bytearray] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
@@ -290,7 +280,8 @@ class Model(ABC):

        for i in range(vocab_size):
            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
@@ -323,12 +314,13 @@ class Model(ABC):
        toktypes: list[int] = []

        if not tokenizer_path.is_file():
-            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)

        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

-        for token_id in range(tokenizer.vocab_size()):
+        for token_id in range(vocab_size):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)
@@ -353,13 +345,9 @@ class Model(ABC):
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
-                    key = key.encode("utf-8")
-                    if key not in tokens:
-                        tokens.append(key)
-                        scores.append(-1000.0)
-                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        assert len(tokens) == vocab_size
+                    tokens.append(key.encode("utf-8"))
+                    scores.append(-1000.0)
+                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
@@ -369,8 +357,12 @@ class Model(ABC):
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+    def _set_vocab_hf(self):
+        path = self.dir_model
+        added_tokens_path = self.dir_model
+        vocab = HfVocab(
+            path, added_tokens_path if added_tokens_path.exists() else None
+        )
        tokens = []
        scores = []
        toktypes = []
@@ -510,17 +502,6 @@ class BloomModel(Model):
 class MPTModel(Model):
    model_arch = gguf.MODEL_ARCH.MPT

-    def set_vocab(self):
-        try:
-            self._set_vocab_gpt2()
-        except Exception:
-            # Fallback for SEA-LION model
-            self._set_vocab_sentencepiece()
-            self.gguf_writer.add_add_bos_token(False)
-            self.gguf_writer.add_pad_token_id(3)
-            self.gguf_writer.add_eos_token_id(1)
-            self.gguf_writer.add_unk_token_id(0)
-
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layers"]
        self.gguf_writer.add_name(self.dir_model.name)
@@ -534,10 +515,7 @@ class MPTModel(Model):
        self.gguf_writer.add_layer_norm_eps(1e-5)
        if self.hparams["attn_config"]["clip_qkv"] is not None:
            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
-        if self.hparams["attn_config"]["alibi"]:
-            self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
-        else:
-            self.gguf_writer.add_max_alibi_bias(0.0)
+        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
@@ -786,148 +764,6 @@ class BaichuanModel(Model):
        return weights[r * n_part:r * n_part + r, ...]


-@Model.register("XverseForCausalLM")
-class XverseModel(Model):
-    model_arch = gguf.MODEL_ARCH.XVERSE
-
-    def set_vocab(self):
-        assert (self.dir_model / "tokenizer.json").is_file()
-        dir_model = self.dir_model
-        hparams = self.hparams
-
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        for token_id in range(vocab_size):
-            token_text = reverse_vocab[token_id].encode('utf-8')
-            # replace "\x00" to string with length > 0
-            if token_text == b"\x00":
-                toktype = gguf.TokenType.BYTE  # special
-                token_text = f"<{token_text}>".encode('utf-8')
-            elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-                toktype = gguf.TokenType.BYTE  # special
-            elif reverse_vocab[token_id] in added_vocab:
-                if tokenizer.added_tokens_decoder[token_id].special:
-                    toktype = gguf.TokenType.CONTROL
-                else:
-                    toktype = gguf.TokenType.USER_DEFINED
-            else:
-                toktype = gguf.TokenType.NORMAL
-
-            tokens.append(token_text)
-            toktypes.append(toktype)
-
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-        hf_repo = self.hparams.get("_name_or_path", "")
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            print("gguf: can not find ctx length parameter.")
-            sys.exit()
-
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_source_hf_repo(hf_repo)
-        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-
-        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
-            if self.hparams["rope_scaling"].get("type") == "linear":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-
-    def write_tensors(self):
-        # Collect tensors from generator object
-        model_kv = dict(self.get_tensors())
-        block_count = self.hparams["num_hidden_layers"]
-        head_count = self.hparams["num_attention_heads"]
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            # HF models permute some of the tensors, so we need to undo that
-            if name.endswith(("q_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
-            if name.endswith(("k_proj.weight")):
-                data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
-
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
-
-
@Model.register("FalconForCausalLM", "RWForCausalLM")
 class FalconModel(Model):
    model_arch = gguf.MODEL_ARCH.FALCON
@@ -1207,222 +1043,13 @@ class StableLMModel(Model):
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))


-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
-class LlamaModel(Model):
+@Model.register("MixtralForCausalLM")
+class MixtralModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

-    def set_vocab(self):
-        try:
-            self. _set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_llama_hf()
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        hparams = self.hparams
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
-
-    # Same as super class, but permuting q_proj, k_proj
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_head = self.hparams.get("num_attention_heads")
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        n_experts = self.hparams.get("num_local_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.numpy()
-
-            if name.endswith("q_proj.weight"):
-                data = permute(data, n_head, n_head)
-            if name.endswith("k_proj.weight"):
-                data = permute(data, n_head, n_kv_head)
-
-            data = data.squeeze()
-
-            # process the experts separately
-            if name.find("block_sparse_moe.experts") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for wid in range(1, 4):
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
-
-                            datas = []
-                            for xid in range(n_experts):
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
-
-                            data = np.stack(datas, axis=0)
-                            data_dtype = data.dtype
-
-                            if self.ftype == 0 and data_dtype == np.float16:
-                                data = data.astype(np.float32)
-
-                            if self.ftype == 1 and data_dtype == np.float32:
-                                data = data.astype(np.float16)
-
-                            merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
-
-                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
-
-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-                            self.gguf_writer.add_tensor(new_name, data)
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # 1d tensors need to be converted to float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-        if len(experts) > 0:
-            raise ValueError(f"Unprocessed experts: {experts.keys()}")
-
-
-@Model.register("GrokForCausalLM")
-class GrokModel(Model):
-    model_arch = gguf.MODEL_ARCH.GROK
-
    def set_vocab(self):
        self._set_vocab_sentencepiece()

-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_name("Grok")
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        n_experts = self.hparams.get("num_local_experts")
-        experts = dict()
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # process the experts separately
-            if name.find(".moe.") != -1:
-                experts[name] = data
-                if len(experts) >= n_experts:
-                    # merge the experts into a single 3d tensor
-                    for bid in range(block_count):
-                        for wid in ["linear", "linear_1", "linear_v"]:
-                            full = True
-                            for xid in range(n_experts):
-                                ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                                if ename not in experts:
-                                    full = False
-                                    break
-                            if not full:
-                                continue
-
-                            datas = []
-                            for xid in range(n_experts):
-                                ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
-                                datas.append(experts[ename])
-                                del experts[ename]
-
-                            data = np.stack(datas, axis=0)
-                            data_dtype = data.dtype
-
-                            if self.ftype == 0 and data_dtype == np.float16:
-                                data = data.astype(np.float32)
-
-                            if self.ftype == 1 and data_dtype == np.float32:
-                                data = data.astype(np.float16)
-
-                            merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
-
-                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
-                            if new_name is None:
-                                print(f"Can not map tensor {name!r}")
-                                sys.exit()
-
-                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
-                            self.gguf_writer.add_tensor(new_name, data)
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-

@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
@@ -1442,7 +1069,7 @@ class MiniCPMModel(Model):
        self.gguf_writer.add_file_type(self.ftype)

    def set_vocab(self):
-        self._set_vocab_llama_hf()
+        self._set_vocab_hf()

    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
        if n_kv_head is not None and n_head != n_kv_head:
@@ -2043,8 +1670,11 @@ class BertModel(Model):
            self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
+        path = self.dir_model
+        added_tokens_path = self.dir_model if self.dir_model.exists() else None
+
        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
+        vocab = HfVocab(path, added_tokens_path)
        tokens, scores, toktypes = zip(*vocab.all_tokens())
        assert len(tokens) == vocab.vocab_size
        self.vocab_size = vocab.vocab_size
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -106,12 +106,12 @@ def main():
    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
-        data_torch = tensors[name]
+        data = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
-        old_dtype = data_torch.dtype
+        old_dtype = data.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-        data = data_torch.to(torch.float32).squeeze().numpy()
+        data = data.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
--- a/convert.py
+++ b/convert.py
@@ -16,14 +16,13 @@ import re
 import signal
 import struct
 import sys
-import textwrap
 import time
 import zipfile
-from abc import ABC, abstractmethod
+from abc import ABCMeta, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar

 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -44,9 +43,6 @@ ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8

-ADDED_TOKENS_FILE = 'added_tokens.json'
-FAST_TOKENIZER_FILE = 'tokenizer.json'
-
 #
 # data types
 #
@@ -192,10 +188,8 @@ class Params:
            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

        if n_layer < 1:
-            msg = """\
-                failed to guess 'n_layer'. This model is unknown or unsupported.
-                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
-            raise KeyError(textwrap.dedent(msg))
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

        n_head = n_embd // 128 # guessed
        n_mult = 256           # guessed
@@ -217,8 +211,7 @@ class Params:

    @staticmethod
    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
-        with open(config_path) as f:
-            config = json.load(f)
+        config = json.load(open(config_path))

        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
        rope_scaling = config.get("rope_scaling")
@@ -240,10 +233,8 @@ class Params:
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            msg = """\
-                failed to guess 'n_ctx'. This model is unknown or unsupported.
-                Suggestion: provide 'config.json' of the model in the same directory containing model files."""
-            raise KeyError(textwrap.dedent(msg))
+            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

        n_experts      = None
        n_experts_used = None
@@ -274,8 +265,7 @@ class Params:
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
-        with open(config_path) as f:
-            config = json.load(f)
+        config = json.load(open(config_path))

        n_experts      = None
        n_experts_used = None
@@ -341,86 +331,47 @@ class Params:
 # vocab
 #

-@runtime_checkable
-class BaseVocab(Protocol):
-    tokenizer_model: ClassVar[str]
-    name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
-    tokenizer_model = "no_vocab"
-    name = "no_vocab"
-
-    def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
-    vocab_size: int
-    added_tokens_dict: dict[str, int]
-    added_tokens_list: list[str]
-    fname_tokenizer: Path
-
-    def __init__(self, base_path: Path): ...
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
+class BpeVocab:
    tokenizer_model = "gpt2"
    name = "bpe"

-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-
-        if (fname_tokenizer := base_path / 'vocab.json').exists():
-            # "slow" tokenizer
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                self.vocab = json.load(f)
-
-            try:
-                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+        if isinstance(self.bpe_tokenizer.get('model'), dict):
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
        else:
-            # "fast" tokenizer
-            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
+            self.vocab = self.bpe_tokenizer
+        added_tokens: dict[str, int]
+        if fname_added_tokens is not None:
+            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            # Fall back to trying to find the added tokens in tokenizer.json
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
+            if not tokenizer_json_file.is_file():
+                added_tokens = {}
+            else:
+                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
+                added_tokens = dict(
+                    (item['content'], item['id'])
+                    for item in tokenizer_json.get('added_tokens', [])
+                    # Added tokens here can be duplicates of the main vocabulary.
+                    if item['content'] not in self.bpe_tokenizer)

-            # if this fails, FileNotFoundError propagates to caller
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                tokenizer_json = json.load(f)
-
-            tokenizer_model: dict[str, Any] = tokenizer_json['model']
-            if (
-                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
-                or tokenizer_json['decoder']['type'] != 'ByteLevel'
-            ):
-                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
-            self.vocab = tokenizer_model["vocab"]
-
-            if (added := tokenizer_json.get('added_tokens')) is not None:
-                # Added tokens here can be duplicates of the main vocabulary.
-                added_tokens = {item['content']: item['id']
-                                for item in added
-                                if item['content'] not in self.vocab}
-
-        vocab_size   = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids   = sorted(added_tokens.values())
+        vocab_size: int = len(self.vocab)
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
-                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict    = added_tokens
        self.added_tokens_list    = [text for (text, idx) in items]
-        self.vocab_size_base      = vocab_size
-        self.vocab_size           = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -441,25 +392,19 @@ class BpeVocab(Vocab):
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class SentencePieceVocab(Vocab):
+class SentencePieceVocab:
    tokenizer_model = "llama"
    name = "spm"

-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
-            # normal location
-            try:
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
-            # not found in alternate location either
-            raise FileNotFoundError('Cannot find tokenizer.model')
-
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+        added_tokens: dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}
+
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()

        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
@@ -469,17 +414,18 @@ class SentencePieceVocab(Vocab):
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict  = added_tokens
+        self.added_tokens_dict = added_tokens
        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base    = vocab_size
        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer    = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(i)
-            text         = piece.encode("utf-8")
+            text: bytes = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)

            toktype = gguf.TokenType.NORMAL
@@ -512,42 +458,27 @@ class SentencePieceVocab(Vocab):
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class LlamaHfVocab(Vocab):
+class HfVocab:
    tokenizer_model = "llama"
    name = "hfft"

-    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
-        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-        # if this fails, FileNotFoundError propagates to caller
-        with open(fname_tokenizer, encoding='utf-8') as f:
-            tokenizer_json = json.load(f)
-
-        # pre-check so we know if we need transformers
-        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if ignore_nonllama:
-            pass  # workaround incorrect use of this class for WordPiece
-        elif (
-            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
-            or tokenizer_json['decoder']['type'] != 'Sequence'
-        ):
-            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
-                "To use LlamaHfVocab, please install the `transformers` package. "
+                "To use HfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e

+        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
-            base_path,
-            cache_dir=base_path,
+            fname_tokenizer,
+            cache_dir=fname_tokenizer,
            local_files_only=True,
        )
-        assert self.tokenizer.is_fast  # assume tokenizer.json is used

        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
@@ -575,7 +506,8 @@ class LlamaHfVocab(Vocab):
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)

-        self.fname_tokenizer = fname_tokenizer
+        self.fname_tokenizer    = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens

    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
@@ -627,7 +559,18 @@ class LlamaHfVocab(Vocab):
        yield from self.added_tokens()

    def __repr__(self) -> str:
-        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"


 #
@@ -645,7 +588,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
            .reshape(weights.shape))


-class Tensor(ABC):
+class Tensor(metaclass=ABCMeta):
    data_type: DataType

    @abstractmethod
@@ -667,7 +610,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:


 class UnquantizedTensor(Tensor):
-    def __init__(self, ndarray: NDArray):
+    def __init__(self, ndarray: NDArray) -> None:
        assert isinstance(ndarray, np.ndarray)
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
@@ -746,7 +689,7 @@ class ModelPlus:
    model: LazyModel
    paths: list[Path]  # Where this was read from.
    format: Literal['ggml', 'torch', 'safetensors', 'none']
-    vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.
+    vocab: Vocab | None  # For GGML models (which have vocab built in), the vocab.


 def merge_sharded(models: list[LazyModel]) -> LazyModel:
@@ -755,7 +698,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
    names = {name: None for model in models for name in model}

    def convert(name: str) -> LazyTensor:
-        lazy_tensors = [model[name] for model in models]
+        lazy_tensors: list[LazyTensor] = [model[name] for model in models]
        if len(lazy_tensors) == 1:
            # only one file; don't go through this procedure since there might
            # be quantized tensors
@@ -776,7 +719,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:

        def load() -> UnquantizedTensor:
            ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
-            concatenated = np.concatenate(ndarrays, axis=axis)
+            concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
            return UnquantizedTensor(concatenated)
        description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
        return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
@@ -828,15 +771,6 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)


-def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
-    def load() -> Tensor:
-        tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
-        return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
-    s = lazy_tensors[0].shape.copy()
-    s.insert(0, len(lazy_tensors))
-    return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
-
-
 # Functionality that simulates `torch.load` but where individual tensors are
 # only loaded into memory on demand, not all at once.
 # PyTorch can't do this natively as of time of writing:
@@ -873,10 +807,10 @@ class LazyUnpickler(pickle.Unpickler):

        def load(offset: int, elm_count: int) -> NDArray:
            dtype = data_type.dtype
-            with self.zip_file.open(info) as fp:
-                fp.seek(offset * dtype.itemsize)
-                size = elm_count * dtype.itemsize
-                data = fp.read(size)
+            fp = self.zip_file.open(info)
+            fp.seek(offset * dtype.itemsize)
+            size = elm_count * dtype.itemsize
+            data = fp.read(size)
            assert len(data) == size
            return np.frombuffer(data, dtype)
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
@@ -897,7 +831,7 @@ class LazyUnpickler(pickle.Unpickler):
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

-    CLASSES = {
+    CLASSES: dict[tuple[str, str], Any] = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -956,7 +890,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
 def must_read(fp: IO[bytes], length: int) -> bytes:
    ret = fp.read(length)
    if len(ret) < length:
-        raise EOFError("unexpectedly reached end of file")
+        raise Exception("unexpectedly reached end of file")
    return ret


@@ -1014,14 +948,13 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
            yield result


-def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
    # Handle special case where the model's vocab size is not set
    if params.n_vocab == -1:
        raise ValueError(
-            "The model's vocab size is set to -1 in params.json. Please update it manually."
-            + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
        )
-    if not isinstance(vocab, Vocab):
+    if isinstance(vocab, NoVocab):
        return  # model has no vocab

    # Check for a vocab size mismatch
@@ -1046,11 +979,11 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
    if vocab.vocab_size < params.n_vocab:
        msg += " Add the --pad-vocab option and try again."

-    raise ValueError(msg)
+    raise Exception(msg)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

    def add_meta_arch(self, params: Params) -> None:
@@ -1101,6 +1034,8 @@ class OutputFile:
            self.gguf.add_file_type(params.ftype)

    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
        tokens = []
        scores = []
        toktypes = []
@@ -1200,7 +1135,7 @@ class OutputFile:

    @staticmethod
    def write_all(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
@@ -1210,11 +1145,11 @@ class OutputFile:

        # meta data
        of.add_meta_arch(params)
-        if isinstance(vocab, Vocab):
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
            of.add_meta_vocab(vocab)
            of.add_meta_special_vocab(svocab)
-        else:  # NoVocab
-            of.gguf.add_tokenizer_model(vocab.tokenizer_model)

        # tensor info
        for name, lazy_tensor in model.items():
@@ -1241,7 +1176,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT

    name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}

-    raise ValueError(f"Unexpected combination of types: {name_to_type}")
+    raise Exception(f"Unexpected combination of types: {name_to_type}")


 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@@ -1251,26 +1186,10 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM

 def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
-    should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
+    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

    tmp = model

-    # merge experts into one tensor
-    if params.n_experts and params.n_experts > 0:
-        for i_l in range(params.n_layer):
-            for w in range(1, 4):
-                experts = []
-                for e in range(params.n_experts):
-                    if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
-                        experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
-                        del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
-                    elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
-                        experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
-                        del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
-                    else:
-                        raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
-                tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
-
    # HF models permut or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
@@ -1294,7 +1213,8 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
            if skip_unknown:
                print(f"Unexpected tensor name: {name} - skipping")
                continue
-            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
+            else:
+                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

        if tensor_type in should_skip:
            print(f"skipping tensor {name_new}")
@@ -1311,7 +1231,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
    the nth path in the model.
    '''
    # Support the following patterns:
-    patterns = [
+    patterns: list[tuple[str, str]] = [
        # - x.00.pth, x.01.pth, etc.
        (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
        # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
@@ -1357,9 +1277,9 @@ def load_some_model(path: Path) -> ModelPlus:
            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
            files = [file for glob in globs for file in path.glob(glob)]
        if not files:
-            raise FileNotFoundError(f"Can't find model in directory {path}")
+            raise Exception(f"Can't find model in directory {path}")
        if len(files) > 1:
-            raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
+            raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
        path = files[0]

    paths = find_multifile_paths(path)
@@ -1373,14 +1293,36 @@ def load_some_model(path: Path) -> ModelPlus:


 class VocabFactory:
-    _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
+    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}

    def __init__(self, path: Path):
        self.path = path
+        self.file_paths = self._detect_files()
+        print(f"Found vocab files: {self.file_paths}")

-    def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
+    def _detect_files(self) -> dict[str, Path | None]:
+        def locate(file: str) -> Path | None:
+            if (path := self.path / file).exists():
+                return path
+            if (path := self.path.parent / file).exists():
+                return path
+            return None
+
+        return {vt: locate(f) for vt, f in self._FILES.items()}
+
+    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
+        for vtype in vocab_types:
+            try:
+                path = self.file_paths[vtype]
+            except KeyError:
+                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+            if path is not None:
+                return vtype, path
+        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
+
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
        load_merges = vocab.name == "bpe"
-        n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
+        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=load_merges,
@@ -1389,29 +1331,27 @@ class VocabFactory:
        )

    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
-        vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
-        selected_vocabs: dict[str, type[Vocab]] = {}
-        for vtype in vocab_types:
-            try:
-                selected_vocabs[vtype] = vocab_classes[vtype]
-            except KeyError:
-                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
+        vocab_type, path = self._select_file(vocab_types)
+        print(f"Loading vocab file {path!r}, type {vocab_type!r}")

-        for vtype, cls in selected_vocabs.items():
-            try:
-                vocab = cls(self.path)
-                break
-            except FileNotFoundError:
-                pass  # ignore unavailable tokenizers
-        else:
-            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
+        added_tokens_path = path.parent / "added_tokens.json"
+        if vocab_type == "bpe":
+            return BpeVocab(
+                path, added_tokens_path if added_tokens_path.exists() else None
+            )
+        if vocab_type == "spm":
+            return SentencePieceVocab(
+                path, added_tokens_path if added_tokens_path.exists() else None
+            )
+        if vocab_type == "hfft":
+            return HfVocab(
+                path.parent, added_tokens_path if added_tokens_path.exists() else None
+            )
+        raise ValueError(vocab_type)

-        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
-        return vocab
-
-    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
-        vocab: BaseVocab
-        if vocab_types is None:
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
            vocab = NoVocab()
        else:
            vocab = self._create_vocab_by_path(vocab_types)
@@ -1468,8 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")

    args = parser.parse_args(args_in)
-    if args.no_vocab and args.vocab_only:
-        raise ValueError("--vocab-only does not make sense with --no-vocab")
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"

    if args.dump_single:
        model_plus = lazy_load_file(args.model)
@@ -1491,12 +1433,10 @@ def main(args_in: list[str] | None = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            msg = """\
-                The model doesn't have a context size, and you didn't specify one with --ctx
-                Please specify one with --ctx:
-                 - LLaMA v1: --ctx 2048
-                 - LLaMA v2: --ctx 4096"""
-            parser.error(textwrap.dedent(msg))
+            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                            "Please specify one with --ctx:\n"
+                            " - LLaMA v1: --ctx 2048\n"
+                            " - LLaMA v2: --ctx 4096\n")
        params.n_ctx = args.ctx

    if args.outtype:
@@ -1511,11 +1451,9 @@ def main(args_in: list[str] | None = None) -> None:
    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
-    vocab_types = None if args.no_vocab else args.vocab_type.split(",")
-    vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
+    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)

    if args.vocab_only:
-        assert isinstance(vocab, Vocab)
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -1,7 +1,7 @@
 # Token generation performance troubleshooting

-## Verifying that the model is running on the GPU with CUDA
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with cuBLAS
+Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -34,7 +34,6 @@ else()
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
-    add_subdirectory(retrieval)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
    add_subdirectory(passkey)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99

 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32
 ```

 ## Sample results
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,15 +32,13 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }

    int n_kv_max     = 2048;
-    int n_batch      = 2048;
-    int n_ubatch     = 512;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;

@@ -58,31 +56,23 @@ int main(int argc, char ** argv) {
    }

    if (argc >= 4) {
-        n_batch = std::atoi(argv[3]);
+        is_pp_shared = std::atoi(argv[3]);
    }

    if (argc >= 5) {
-        n_ubatch = std::atoi(argv[4]);
+        n_gpu_layers = std::atoi(argv[4]);
    }

    if (argc >= 6) {
-        is_pp_shared = std::atoi(argv[5]);
+        n_pp = parse_list(argv[5]);
    }

    if (argc >= 7) {
-        n_gpu_layers = std::atoi(argv[6]);
+        n_tg = parse_list(argv[6]);
    }

    if (argc >= 8) {
-        n_pp = parse_list(argv[7]);
-    }
-
-    if (argc >= 9) {
-        n_tg = parse_list(argv[8]);
-    }
-
-    if (argc >= 10) {
-        n_pl = parse_list(argv[9]);
+        n_pl = parse_list(argv[7]);
    }

    // init LLM
@@ -110,8 +100,7 @@ int main(int argc, char ** argv) {

    ctx_params.seed      = 1234;
    ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = n_batch;
-    ctx_params.n_ubatch  = n_ubatch;
+    ctx_params.n_batch   = 512;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -169,7 +158,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -21,8 +21,6 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.

 `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

-Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
-
 Now you can use the model with a command like:

 `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "log.h"

 #include <unordered_map>
 #include <vector>
@@ -79,101 +78,111 @@ typedef struct {

 struct TransformerWeights {
    // token embedding table
-    std::vector<float> token_embedding_table;    // (vocab_size, dim)
+    float* token_embedding_table;    // (vocab_size, dim)
    // weights for rmsnorms
-    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
-    std::vector<float> rms_ffn_weight; // (layer, dim)
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
    // weights for matmuls
-    std::vector<float> wq; // (layer, dim, dim)
-    std::vector<float> wk; // (layer, dim, dim)
-    std::vector<float> wv; // (layer, dim, dim)
-    std::vector<float> wo; // (layer, dim, dim)
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
    // weights for ffn
-    std::vector<float> w1; // (layer, hidden_dim, dim)
-    std::vector<float> w2; // (layer, dim, hidden_dim)
-    std::vector<float> w3; // (layer, hidden_dim, dim)
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
    // final rmsnorm
-    std::vector<float> rms_final_weight; // (dim,)
+    float* rms_final_weight; // (dim,)
    // freq_cis for RoPE relatively positional embeddings
-    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
-    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
-    std::vector<float> wcls;
+    float* wcls;
+
+    ~TransformerWeights() {
+        delete[] token_embedding_table;
+        delete[] rms_att_weight;
+        delete[] rms_ffn_weight;
+        delete[] wq;
+        delete[] wk;
+        delete[] wv;
+        delete[] wo;
+        delete[] w1;
+        delete[] w2;
+        delete[] w3;
+        delete[] rms_final_weight;
+        delete[] wcls;
+    }
 };

-static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
-    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
-    try {
-        w->token_embedding_table.resize(p->vocab_size * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
+    // value-initialize (zero-fill) each array via new[]() rather than leaving it uninitialized, to keep valgrind happy
+    w->token_embedding_table = new float[p->vocab_size * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

-        w->rms_att_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+    w->rms_att_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

-        w->rms_ffn_weight.resize(p->n_layers * p->dim);
-        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

-        w->wq.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+    w->wq = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+    w->wk = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+    w->wv = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-        w->wo.resize(p->n_layers * p->dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+    w->wo = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

-        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

-        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

-        w->rms_final_weight.resize(p->dim);
-        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+    w->rms_final_weight = new float[p->dim]();
+    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

-        if (shared_weights) {
-            w->wcls = {};
-        } else {
-            w->wcls.resize(p->vocab_size * p->dim);
-            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
-        }
-    }
-    catch (std::length_error &) {
-        die("Invalid configuration. Failed to allocate memory for weights");
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
    }
 }

-static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
-    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
-    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
-    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
-    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
-    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
-    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
-    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
-    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
-    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
-    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
-    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
+static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

-    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
-        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", curr, end);
        return 1;
    }

@@ -181,20 +190,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }

 static void print_sample_weights(TransformerWeights *w){
-    LOG("----- Quick print of first of the weight vales of all the variables\n");
-    LOG("%f\n", w->token_embedding_table[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    LOG("%f\n", w->rms_ffn_weight[0]);
+    printf("----- Quick print of first of the weight vales of all the variables\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);

-    LOG("%f\n", w->wq[0]);
-    LOG("%f\n", w->wk[0]);
-    LOG("%f\n", w->wv[0]);
-    LOG("%f\n", w->wo[0]);
-    LOG("%f\n", w->w1[0]);
-    LOG("%f\n", w->w2[0]);
-    LOG("%f\n", w->w3[0]);
-    LOG("%f\n", w->rms_att_weight[0]);
-    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -216,16 +225,14 @@ struct llama_vocab {
 };

 struct my_llama_hparams {
-    uint32_t n_vocab   = 32000;
-    uint32_t n_ctx     = 512;   // this is provided as user input?
-    uint32_t n_embd    = 4096;
-    uint32_t n_ff      = 11008;
-    uint32_t n_mult    = 4;
-    uint32_t n_head    = 32;
-    uint32_t n_head_kv = 32;
-    uint32_t n_layer   = 32;
-    uint32_t n_rot     = 64;
-
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_ff    = 11008;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams));
    }
@@ -318,30 +325,14 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    LOG("%s: n_vocab:   %u\n", __func__, params->n_vocab);
-    LOG("%s: n_ctx:     %u\n", __func__, params->n_ctx);
-    LOG("%s: n_embd:    %u\n", __func__, params->n_embd);
-    LOG("%s: n_mult:    %u\n", __func__, params->n_mult);
-    LOG("%s: n_head:    %u\n", __func__, params->n_head);
-    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-    LOG("%s: n_ff:      %u\n", __func__, params->n_ff);
-    LOG("%s: n_layer:   %u\n", __func__, params->n_layer);
-    LOG("%s: n_rot:     %u\n", __func__, params->n_rot);
-}
-
-static void print_tensor_info(const struct ggml_context * ctx) {
-    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        LOG("%s: Allocating ", __func__);
-        int64_t total = 1;
-        int i = 0;
-        for (; i < ggml_n_dims(t); ++i) {
-            if (i > 0) LOG("x ");
-            LOG("[%" PRId64 "] ", t->ne[i]);
-            total *= t->ne[i];
-        }
-        if (i > 1) LOG("= [%" PRId64 "] ", total);
-        LOG("float space for %s\n", ggml_get_name(t));
-    }
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }

 static void init_model(struct my_llama_model * model) {
@@ -351,8 +342,6 @@ static void init_model(struct my_llama_model * model) {
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

-    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
-
    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

@@ -361,8 +350,25 @@ static void init_model(struct my_llama_model * model) {
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
+
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+
+    // print the per-layer allocations once here so we don't repeat them inside the per-layer for loop.
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
@@ -377,8 +383,8 @@ static void init_model(struct my_llama_model * model) {
        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -400,8 +406,6 @@ static void init_model(struct my_llama_model * model) {
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }
-
-    print_tensor_info(ctx);
 }

 static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -417,9 +421,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
 static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
-        LOG(" %f", p);
+        printf(" %f", p);
    }
-    LOG("\n");
+    printf("\n");
 }

 static void print_matrix(struct ggml_tensor * probs) {
@@ -427,12 +431,33 @@ static void print_matrix(struct ggml_tensor * probs) {
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
-            LOG(" %.2f", p);
+            printf(" %.2f", p);
        }
-        LOG("\n");
+        printf("\n");
    }
 }

+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
@@ -524,9 +549,8 @@ static std::string llama_escape_whitespaces(const std::string & text) {
    return out.str();
 }

-static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
+static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
    if (is_ggml_file(filename)) {
-        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
@@ -554,9 +578,6 @@ static void load_vocab(const char * filename, const Config * config, struct llam
        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
-        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
-            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
-        }

        vocab->id_to_token.resize(n_vocab);

@@ -574,7 +595,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
-        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
@@ -617,15 +638,38 @@ static void load_vocab(const char * filename, const Config * config, struct llam
 }

 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
-    int size = 1;
-    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
-        size *= gg_weights->ne[dim];
-    }
-    for (int ct = 0; ct < size; ++ct) {
-        int64_t i0 = 0; int64_t i1 = 0;
-        int64_t i2 = 0; int64_t i3 = 0;
-        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
-        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
+    int ct;
+    switch (ggml_n_dims(gg_weights)) {
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break;
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
    }
 }

@@ -635,18 +679,16 @@ static void save_as_llama_model(
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float*                   -> struct ggml_tensor
-    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
-    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
+    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
+    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);

-    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
+    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

-    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
-
    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
        auto & layer = model->layers[i];
        // 1d
@@ -655,10 +697,9 @@ static void save_as_llama_model(

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
-        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length/n_multiqueries]);
-        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length/n_multiqueries]);

        convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
@@ -695,8 +736,8 @@ static void save_as_llama_model(
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
-    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
-    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
+    // n_head_kv is optional, default to n_head
+    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -748,12 +789,12 @@ static void save_as_llama_model(

 static struct train_params get_default_train_params() {
    struct train_params params;
-    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
+    params.fn_vocab_model    = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
-    params.fn_train_data           = "shakespeare.txt";
-    params.fn_checkpoint_in        = "checkpoint.bin";
-    params.fn_checkpoint_out       = "checkpoint.bin";
-    params.fn_model_out            = "ggml-checkpoint-f32.bin";
+    params.fn_train_data     = "shakespeare.txt";
+    params.fn_checkpoint_in  = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out      = "ggml-checkpoint-f32.bin";

    params.seed       =   -1;

@@ -788,8 +829,8 @@ static struct train_params get_default_train_params() {
    params.adam_alpha        = 1e-3f;
    params.adam_decay        = 1e-3f;

-    params.mem_model_gb    = 2;
-    params.mem_compute_gb  = 24;
+    params.mem_model_gb   = 2;
+    params.mem_compute_gb = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

@@ -875,30 +916,19 @@ int main(int argc, char ** argv) {
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }
-    log_set_target(stdout);
    Config config;
    TransformerWeights weights = {};
    {
-        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
-        FILE * file = fopen(params.fn_llama2c_model, "rb");
-        if (!file) {
-            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
-            return 1;
-        }
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
        // read in the config header
-        if (fread(&config, sizeof(Config), 1, file) != 1) {
-            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
-            return 1;
-        }
+        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
-        alloc_weights(&weights, &config, shared_weights);
-        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-            LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
-            return 1;
-        }
+        malloc_weights(&weights, &config, shared_weights);
+        if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
        fclose(file);
    }

@@ -906,18 +936,15 @@ int main(int argc, char ** argv) {
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
-    model.hparams.n_vocab   = config.vocab_size; //llama_n_vocab(lctx);
-    model.hparams.n_ctx     = params.n_ctx;
-    model.hparams.n_embd    = config.dim; //params.n_embd;
-    model.hparams.n_ff      = config.hidden_dim;
-    model.hparams.n_mult    = 32;//params.n_mult;
-    model.hparams.n_head    = config.n_heads; //params.n_head;
-    model.hparams.n_head_kv = config.n_kv_heads;
-    model.hparams.n_layer   = config.n_layers; //params.n_layer;
-    model.hparams.n_rot     = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
-
+    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.n_embd  = config.dim; //params.n_embd;
+    model.hparams.n_ff    = config.hidden_dim;
+    model.hparams.n_mult  = 32;//params.n_mult;
+    model.hparams.n_head  = config.n_heads; //params.n_head;
+    model.hparams.n_layer = config.n_layers; //params.n_layer;
+    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
    print_params(&model.hparams);
-
    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
@@ -929,7 +956,7 @@ int main(int argc, char ** argv) {
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

-    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -61,8 +61,6 @@ int main(int argc, char ** argv) {
    }

    params.embedding = true;
-    // For non-causal models, batch size must be equal to ubatch size
-    params.n_ubatch = params.n_batch;

    print_build_info();

@@ -116,9 +114,7 @@ int main(int argc, char ** argv) {
    for (const auto & prompt : prompts) {
        auto inp = ::llama_tokenize(ctx, prompt, true, false);
        if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
-                    __func__, (long long int) inp.size(), (long long int) n_batch);
-            return 1;
+            inp.resize(n_batch);
        }
        inputs.push_back(inp);
    }
@@ -178,27 +174,25 @@ int main(int argc, char ** argv) {
    float * out = emb + p * n_embd;
    batch_decode(ctx, batch, out, s, n_embd);

-    // print the first part of the embeddings or for a single prompt, the full embedding
+    // print the first part of the embeddings
    fprintf(stdout, "\n");
    for (int j = 0; j < n_prompts; j++) {
        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+        for (int i = 0; i < std::min(16, n_embd); i++) {
            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
        }
        fprintf(stdout, "\n");
    }

    // print cosine similarity matrix
-    if (n_prompts > 1) {
-        fprintf(stdout, "\n");
-        printf("cosine similarity matrix:\n\n");
-        for (int i = 0; i < n_prompts; i++) {
-            for (int j = 0; j < n_prompts; j++) {
-                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                fprintf(stdout, "%6.2f ", sim);
-            }
-            fprintf(stdout, "\n");
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
        }
+        fprintf(stdout, "\n");
    }

    // clean up
--- a/examples/gbnf-validator/CMakeLists.txt
+++ b/examples/gbnf-validator/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET gbnf-validator)
-add_executable(${TARGET} gbnf-validator.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -1,132 +0,0 @@
-#define LLAMA_API_INTERNAL
-
-#include "grammar-parser.h"
-#include "ggml.h"
-#include "llama.h"
-#include "unicode.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    auto decoded = decode_utf8(input_str, {});
-    const auto & code_points = decoded.first;
-
-    size_t pos = 0;
-    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
-        if (grammar->stacks.empty()) {
-            error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            grammar->stacks = prev_stacks;
-            return false;
-        }
-        ++pos;
-    }
-
-    for (const auto & stack : grammar->stacks) {
-        if (stack.empty()) {
-            return true;
-        }
-    }
-
-    error_pos = pos;
-    error_msg = "Unexpected end of input";
-    return false;
-}
-
-static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
-    fprintf(stdout, "Input string is invalid according to the grammar.\n");
-    fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
-    fprintf(stdout, "\n");
-    fprintf(stdout, "Input string:\n");
-    fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
-    if (error_pos < input_str.size()) {
-        fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
-        if (error_pos+1 < input_str.size()) {
-            fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
-        }
-        fprintf(stdout, "\033[0m\n");
-    }
-}
-
-int main(int argc, char** argv) {
-    if (argc != 3) {
-        fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string grammar_filename = argv[1];
-    const std::string input_filename = argv[2];
-
-    // Read the GBNF grammar file
-    FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
-    if (!grammar_file) {
-        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
-        return 1;
-    }
-
-    fseek(grammar_file, 0, SEEK_END);
-    size_t grammar_size = ftell(grammar_file);
-    fseek(grammar_file, 0, SEEK_SET);
-
-    std::string grammar_str(grammar_size, ' ');
-    fread(&grammar_str[0], 1, grammar_size, grammar_file);
-    fclose(grammar_file);
-
-    // Parse the GBNF grammar
-    auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
-
-    // will be empty (default) if there are parse errors
-    if (parsed_grammar.rules.empty()) {
-        fprintf(stdout, "%s: failed to parse grammar\n", __func__);
-        return 1;
-    }
-
-    // Ensure that there is a "root" node.
-    if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-        fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
-        return 1;
-    }
-
-    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-    // Create the LLAMA grammar
-    auto grammar = llama_grammar_init(
-            grammar_rules.data(),
-            grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-
-    // Read the input file
-    FILE* input_file = fopen(input_filename.c_str(), "r");
-    if (!input_file) {
-        fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
-        return 1;
-    }
-
-    fseek(input_file, 0, SEEK_END);
-    size_t input_size = ftell(input_file);
-    fseek(input_file, 0, SEEK_SET);
-
-    std::string input_str(input_size, ' ');
-    fread(&input_str[0], 1, input_size, input_file);
-    fclose(input_file);
-
-    // Validate the input string against the grammar
-    size_t error_pos;
-    std::string error_msg;
-    bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
-
-    if (is_valid) {
-        fprintf(stdout, "Input string is valid according to the grammar.\n");
-    } else {
-        print_error_message(input_str, error_pos, error_msg);
-    }
-
-    // Clean up
-    llama_grammar_free(grammar);
-
-    return 0;
-}
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -1,38 +1,37 @@
 #include "llama.h"
+#include "ggml.h"
 #include "common.h"

 #include <algorithm>
 #include <cmath>
+#include <cstdint>
 #include <cstdlib>
 #include <fstream>
+#include <ios>
 #include <string>
 #include <vector>

 #include <stdio.h>
+#include <fcntl.h>
 #include <string.h>
-#include <climits>
-#include <stdexcept>
-
-#if defined(_WIN32)
-    #include <windows.h>
-    #ifndef PATH_MAX
-        #define PATH_MAX MAX_PATH
-    #endif
-    #include <io.h>
-#endif

 enum split_operation : uint8_t {
    SPLIT_OP_SPLIT,
    SPLIT_OP_MERGE,
 };

+static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
+static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
+
+static const int SPLIT_FILENAME_MAX = 256;
+
+static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
+
 struct split_params {
    split_operation operation = SPLIT_OP_SPLIT;
-    size_t n_bytes_split = 0;
    int n_split_tensors = 128;
    std::string input;
    std::string output;
-    bool dry_run = false;
 };

 static void split_print_usage(const char * executable) {
@@ -43,36 +42,15 @@ static void split_print_usage(const char * executable) {
    printf("Apply a GGUF operation on IN to OUT.");
    printf("\n");
    printf("options:\n");
-    printf("  -h, --help              show this help message and exit\n");
-    printf("  --version               show version and build info\n");
-    printf("  --split                 split GGUF to multiple GGUF (enabled by default)\n");
-    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
-    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
-    printf("  --split-max-size N(M|G) max size per split\n");
-    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
+    printf("  -h, --help            show this help message and exit\n");
+    printf("  --version             show version and build info\n");
+    printf("  --split               split GGUF to multiple GGUF (default)\n");
+    printf("  --split-max-tensors   max tensors in each split: default(%d)\n", default_params.n_split_tensors);
+    printf("  --merge               merge multiple GGUF to a single GGUF\n");
    printf("\n");
 }

-// return convert string, for example "128M" or "4G" to number of bytes
-static size_t split_str_to_n_bytes(std::string str) {
-    size_t n_bytes = 0;
-    int n;
-    if (str.back() == 'M') {
-        sscanf(str.c_str(), "%d", &n);
-        n_bytes = n * 1024 * 1024; // megabytes
-    } else if (str.back() == 'G') {
-        sscanf(str.c_str(), "%d", &n);
-        n_bytes = n * 1024 * 1024 * 1024; // gigabytes
-    } else {
-        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
-    }
-    if (n <= 0) {
-        throw std::invalid_argument("error: size must be a positive value");
-    }
-    return n_bytes;
-}
-
-static void split_params_parse_ex(int argc, const char ** argv, split_params & params) {
+static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) {
    std::string arg;
    const std::string arg_prefix = "--";
    bool invalid_param = false;
@@ -85,8 +63,6 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
        }

        bool arg_found = false;
-        bool is_op_set = false;
-        bool is_mode_set = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
@@ -96,46 +72,23 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }
-        if (arg == "--dry-run") {
-            arg_found = true;
-            params.dry_run = true;
-        }

-        if (is_op_set) {
-            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
-        }
        if (arg == "--merge") {
            arg_found = true;
-            is_op_set = true;
            params.operation = SPLIT_OP_MERGE;
        }
        if (arg == "--split") {
            arg_found = true;
-            is_op_set = true;
            params.operation = SPLIT_OP_SPLIT;
        }
-
-        if (is_mode_set) {
-            throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
-        }
        if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
-            is_mode_set = true;
            params.n_split_tensors = atoi(argv[arg_idx]);
        }
-        if (arg == "--split-max-size") {
-            if (++arg_idx >= argc) {
-                invalid_param = true;
-                break;
-            }
-            arg_found = true;
-            is_mode_set = true;
-            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
-        }

        if (!arg_found) {
            throw std::invalid_argument("error: unknown argument: " + arg);
@@ -147,22 +100,29 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
    }

    if (argc - arg_idx < 2) {
-        throw std::invalid_argument("error: bad arguments");
+        printf("%s: bad arguments\n", argv[0]);
+        split_print_usage(argv[0]);
+        return false;
    }

    params.input = argv[arg_idx++];
    params.output = argv[arg_idx++];
+
+    return true;
 }

 static bool split_params_parse(int argc, const char ** argv, split_params & params) {
    bool result = true;
    try {
-        split_params_parse_ex(argc, argv, params);
+        if (!split_params_parse_ex(argc, argv, params)) {
+            split_print_usage(argv[0]);
+            exit(1);
+        }
    }
    catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        split_print_usage(argv[0]);
-        exit(EXIT_FAILURE);
+        exit(1);
    }
    return result;
 }
@@ -174,6 +134,12 @@ static void zeros(std::ofstream & file, size_t n) {
    }
 }

+static std::string split_file_name(const std::string & path, int i_split, int n_split) {
+    char f_split[SPLIT_FILENAME_MAX] = {0};
+    snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
+    return std::string(f_split);
+}
+
 struct split_strategy {
    const split_params params;
    std::ifstream & f_input;
@@ -181,11 +147,15 @@ struct split_strategy {
    struct ggml_context * ctx_meta = NULL;
    const int n_tensors;

-    // one ctx_out per one output file
-    std::vector<struct gguf_context *> ctx_outs;
+    const int n_split;
+    int i_split = 0;

-    // temporary buffer for reading in tensor data
-    std::vector<uint8_t> read_buf;
+    int i_tensor = 0;
+
+    std::vector<uint8_t> read_data;
+
+    struct gguf_context * ctx_out;
+    std::ofstream fout;

    split_strategy(const split_params & params,
            std::ifstream & f_input,
@@ -195,141 +165,77 @@ struct split_strategy {
        f_input(f_input),
        ctx_gguf(ctx_gguf),
        ctx_meta(ctx_meta),
-        n_tensors(gguf_get_n_tensors(ctx_gguf)) {
-
-        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
-        int i_split = -1;
-        struct gguf_context * ctx_out = NULL;
-        auto new_ctx_out = [&]() {
-            i_split++;
-            if (ctx_out != NULL) {
-                if (gguf_get_n_tensors(ctx_out) == 0) {
-                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
-                    exit(EXIT_FAILURE);
-                }
-                ctx_outs.push_back(ctx_out);
-            }
-            ctx_out = gguf_init_empty();
-            // Save all metadata in first split only
-            if (i_split == 0) {
-                gguf_set_kv(ctx_out, ctx_gguf);
-            }
-            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
-            gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder
-            gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
-        };
-
-        // initialize ctx_out for the first split
-        new_ctx_out();
-
-        // process tensors one by one
-        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
-        for (int i = 0; i < n_tensors; ++i) {
-            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-            // calculate the "imaginary" size = the current size + next tensor size
-            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
-            size_t next_tensors_size = curr_tensors_size + n_bytes;
-            if (should_split(i, next_tensors_size)) {
-                new_ctx_out();
-                curr_tensors_size = n_bytes;
-            } else {
-                curr_tensors_size = next_tensors_size;
-            }
-            gguf_add_tensor(ctx_out, t);
+        n_tensors(gguf_get_n_tensors(ctx_gguf)),
+        n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
        }

-        // push the last ctx_out
-        ctx_outs.push_back(ctx_out);
-
-        // set the correct n_split for all ctx_out
-        for (auto & ctx : ctx_outs) {
-            gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size());
-        }
+    bool should_split() const {
+        return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
    }

-    ~split_strategy() {
-        for (auto & ctx_out : ctx_outs) {
-            gguf_free(ctx_out);
+    void split_start() {
+        ctx_out = gguf_init_empty();
+
+        // Save all metadata in first split only
+        if (i_split == 0) {
+            gguf_set_kv(ctx_out, ctx_gguf);
        }
+        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
+        gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
+
+        // populate the original tensors, so we get an initial metadata
+        for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            gguf_add_tensor(ctx_out, meta);
+        }
+
+        auto split_name = split_file_name(params.output, i_split, n_split);
+
+        fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
+        fout = std::ofstream(split_name, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+        auto meta_size = gguf_get_meta_size(ctx_out);
+
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+
+        i_split++;
    }

-    bool should_split(int i_tensor, size_t next_size) {
-        if (params.n_bytes_split > 0) {
-            // split by max size per file
-            return next_size > params.n_bytes_split;
-        } else {
-            // split by number of tensors per file
-            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
+    void next_tensor() {
+        const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
+        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+        auto n_bytes = ggml_nbytes(t);
+
+        if (read_data.size() < n_bytes) {
+            read_data.resize(n_bytes);
        }
+
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
+        f_input.seekg(offset);
+        f_input.read((char *)read_data.data(), n_bytes);
+
+        t->data = read_data.data();
+
+        // write tensor data + padding
+        fout.write((const char *)t->data, n_bytes);
+        zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+
+        i_tensor++;
    }

-    void print_info() {
-        printf("n_split: %ld\n", ctx_outs.size());
-        int i_split = 0;
-        for (auto & ctx_out : ctx_outs) {
-            // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
-            size_t total_size = gguf_get_meta_size(ctx_out);
-            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
-                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
-                total_size += ggml_nbytes(t);
-            }
-            total_size = total_size / 1024 / 1024; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
-            i_split++;
-        }
-    }
+    void split_end() {
+        // go back to beginning of file and write the updated metadata
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *)data.data(), data.size());

-    void write() {
-        int i_split = 0;
-        int n_split = ctx_outs.size();
-        for (auto & ctx_out : ctx_outs) {
-            // construct file path
-            char split_path[PATH_MAX] = {0};
-            llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
+        fout.close();
+        gguf_free(ctx_out);

-            // open the output file
-            printf("Writing file %s ... ", split_path);
-            fflush(stdout);
-            std::ofstream fout = std::ofstream(split_path, std::ios::binary);
-            fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-            // write metadata
-            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-            gguf_get_meta_data(ctx_out, data.data());
-            fout.write((const char *)data.data(), data.size());
-
-            // write tensors
-            for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) {
-                // read tensor meta and prepare buffer
-                const char * t_name = gguf_get_tensor_name(ctx_out, i);
-                struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
-                auto n_bytes = ggml_nbytes(t);
-                read_buf.resize(n_bytes);
-
-                // calculate offset
-                auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file
-                auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
-
-                // copy tensor from input to output file
-                copy_file_to_file(f_input, fout, offset, n_bytes);
-                zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
-            }
-
-            printf("done\n");
-            // close the file
-            fout.close();
-            i_split++;
-        }
-    }
-
-    void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) {
-        // TODO: detect OS and use copy_file_range() here for better performance
-        if (read_buf.size() < len) {
-            read_buf.resize(len);
-        }
-        f_in.seekg(in_offset);
-        f_in.read((char *)read_buf.data(), len);
-        f_out.write((const char *)read_buf.data(), len);
+        fprintf(stderr, "\033[3Ddone\n");
    }
 };

@@ -344,31 +250,37 @@ static void gguf_split(const split_params & split_params) {
    std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
    if (!f_input.is_open()) {
        fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
-        exit(EXIT_FAILURE);
+        exit(1);
    }

    auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
    if (!ctx_gguf) {
        fprintf(stderr, "%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
-        exit(EXIT_FAILURE);
+        exit(1);
    }

-    // prepare the strategy
    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
-    int n_split = strategy.ctx_outs.size();
-    strategy.print_info();
+    fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
+            __func__, split_params.input.c_str(),
+            split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
+            split_params.n_split_tensors);

-    if (!split_params.dry_run) {
-        // write all output splits
-        strategy.write();
+    strategy.split_start();
+
+    while (strategy.i_tensor < strategy.n_tensors) {
+        strategy.next_tensor();
+        if (strategy.should_split()) {
+            strategy.split_end();
+            strategy.split_start();
+        }
    }
+    strategy.split_end();

-    // done, clean up
    gguf_free(ctx_gguf);
    f_input.close();

    fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
-            __func__, n_split, strategy.n_tensors);
+            __func__, strategy.n_split, strategy.n_tensors);
 }

 static void gguf_merge(const split_params & split_params) {
@@ -386,9 +298,7 @@ static void gguf_merge(const split_params & split_params) {
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;

-    char split_path[PATH_MAX] = {0};
-    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
-    char split_prefix[PATH_MAX] = {0};
+    std::string split_prefix;

    // First pass to find KV and tensors metadata
    for (int i_split = 0; i_split < n_split; i_split++) {
@@ -399,66 +309,89 @@ static void gguf_merge(const split_params & split_params) {
            /*.ctx      = */ &ctx_meta,
        };

+        auto split_name = split_params.input;
        if (i_split > 0) {
-            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+            split_name = split_file_name(split_prefix, i_split, n_split);
        }
-        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
+        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());

-        auto * ctx_gguf = gguf_init_from_file(split_path, params);
+        auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
        if (!ctx_gguf) {
            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
-            exit(EXIT_FAILURE);
+            exit(1);
        }
        ctx_ggufs.push_back(ctx_gguf);
        ctx_metas.push_back(ctx_meta);

        if (i_split == 0) {
-            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
            if (key_n_split < 0) {
                fprintf(stderr,
                        "\n%s: input file does not contain %s metadata\n",
                        __func__,
-                        LLM_KV_SPLIT_COUNT);
+                        LLM_KV_GENERAL_SPLIT_N_SPLIT);
                gguf_free(ctx_gguf);
-                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                fout.close();
-                exit(EXIT_FAILURE);
+                exit(1);
            }

-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+            n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
            if (n_split < 1) {
                fprintf(stderr,
                        "\n%s: input file does not contain a valid split count %d\n",
                        __func__,
                        n_split);
                gguf_free(ctx_gguf);
-                ggml_free(ctx_meta);
                gguf_free(ctx_out);
                fout.close();
-                exit(EXIT_FAILURE);
-            }
-
-            // Verify the file naming and extract split_prefix
-            if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
-                fprintf(stderr, "\n%s: unexpected input file name: %s"
-                                " i_split=%d"
-                                " n_split=%d\n", __func__,
-                        split_path, i_split, n_split);
-                gguf_free(ctx_gguf);
-                ggml_free(ctx_meta);
-                gguf_free(ctx_out);
-                fout.close();
-                exit(EXIT_FAILURE);
+                exit(1);
            }

            // Do not trigger merge if we try to merge again the output
-            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
+            gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);

            // Set metadata from the first split
            gguf_set_kv(ctx_out, ctx_gguf);
        }

+        // Verify the file naming
+        {
+            int i_split_file = 0;
+            int n_split_file = 0;
+            const char * i_split_format = "-00000-of-00000.gguf";
+
+            if (split_name.size() < strlen(i_split_format)) {
+                fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
+                for (auto * _ctx_gguf : ctx_ggufs) {
+                    gguf_free(_ctx_gguf);
+                }
+                gguf_free(ctx_out);
+                fout.close();
+                exit(1);
+            }
+
+            split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
+
+            const char * split_name_c_str = split_name.c_str();
+            int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
+
+            if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
+                fprintf(stderr, "\n%s: unexpected input file name: %s"
+                                " i_split=%d i_split_file=%d"
+                                " n_split=%d n_split_file=%d\n", __func__,
+                        split_params.input.c_str(),
+                        i_split, i_split_file,
+                        n_split, n_split_file);
+                for (auto * _ctx_gguf : ctx_ggufs) {
+                    gguf_free(_ctx_gguf);
+                }
+                gguf_free(ctx_out);
+                fout.close();
+                exit(1);
+            }
+        }
+
        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
@@ -478,19 +411,18 @@ static void gguf_merge(const split_params & split_params) {

    // Write tensors data
    for (int i_split = 0; i_split < n_split; i_split++) {
-        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
-        std::ifstream f_input(split_path, std::ios::binary);
+        auto split_name = split_file_name(split_prefix, i_split, n_split);
+        std::ifstream f_input(split_name.c_str(), std::ios::binary);
        if (!f_input.is_open()) {
-            fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_path);
-            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
-                gguf_free(ctx_ggufs[i]);
-                ggml_free(ctx_metas[i]);
+            fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_name.c_str());
+            for (auto * _ctx_gguf : ctx_ggufs) {
+                gguf_free(_ctx_gguf);
            }
            gguf_free(ctx_out);
            fout.close();
-            exit(EXIT_FAILURE);
+            exit(1);
        }
-        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
+        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());

        auto * ctx_gguf = ctx_ggufs[i_split];
        auto * ctx_meta = ctx_metas[i_split];
@@ -537,6 +469,10 @@ static void gguf_merge(const split_params & split_params) {
 }

 int main(int argc, const char ** argv) {
+    if (argc < 3) {
+        split_print_usage(argv[0]);
+    }
+
    split_params params;
    split_params_parse(argc, argv, params);

@@ -545,8 +481,8 @@ int main(int argc, const char ** argv) {
            break;
        case SPLIT_OP_MERGE: gguf_merge(params);
            break;
-        default: split_print_usage(argv[0]);
-            exit(EXIT_FAILURE);
+        default:split_print_usage(argv[0]);
+            exit(1);
    }

    return 0;
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example

 ```bash
-LLAMA_CUDA=1 make -j
+LLAMA_CUBLAS=1 make -j

 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -50,31 +50,29 @@ private:
    void keep_imatrix(int ncall) const;
 };

-// remove any prefix and suffixes from the name
-// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
-static std::string filter_tensor_name(const char * name) {
-    std::string wname;
-    const char * p = strchr(name, '#');
-    if (p != NULL) {
-        p = p + 1;
-        const char * q = strchr(p, '#');
-        if (q != NULL) {
-            wname = std::string(p, q - p);
-        } else {
-            wname = p;
-        }
-    } else {
-        wname = name;
-    }
-    return wname;
-}
-
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
-    std::string wname = filter_tensor_name(src0->name);
+
+    std::string wname;
+    {
+        // remove any prefix and suffixes from the name
+        // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
+        const char * p = strchr(src0->name, '#');
+        if (p != NULL) {
+            p = p + 1;
+            const char * q = strchr(p, '#');
+            if (q != NULL) {
+                wname = std::string(p, q - p);
+            } else {
+                wname = p;
+            }
+        } else {
+            wname = src0->name;
+        }
+    }

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
@@ -98,38 +96,34 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

-    // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        const int idx  = ((int32_t *) t->op_params)[0];
-        const ggml_tensor * ids = t->src[2];
-        const int n_as = src0->ne[2];
+        const int n_as = ((int32_t *) t->op_params)[1];

-        // the top-k selected expert ids are stored in the ids tensor
-        // for simplicity, always copy ids to host, because it is small
-        // take into account that ids is not contiguous!
-        GGML_ASSERT(ids->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
-        m_ids.resize(ggml_nbytes(ids)/sizeof(int));
-        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
-
-        auto & e = m_stats[wname];
-
-        ++e.ncall;
-        // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-        //       using the following line, we can correct for that if needed by replacing the line above with:
-        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+        // the top-k selected expert ids are stored in the src0 tensor
+        // for simplicity, always copy src0 to host, because it is small
+        // take into account that src0 is not contiguous!
+        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
+        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
+        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));

        // loop over all possible experts, regardless if they are used or not in the batch
+        // this is necessary to guarantee equal number of "ncall" for each tensor
        for (int ex = 0; ex < n_as; ++ex) {
-            size_t e_start = ex*src1->ne[0];
+            src0 = t->src[2 + ex];
+            auto& e = m_stats[wname];
            if (e.values.empty()) {
-                e.values.resize(src1->ne[0]*n_as, 0);
+                e.values.resize(src1->ne[0], 0);
            }
-            else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            else if (e.values.size() != (size_t)src1->ne[0]) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
                exit(1); //GGML_ASSERT(false);
            }
+            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+            //       using the following line, we can correct for that if needed
+            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+            ++e.ncall;
            if (m_params.verbosity > 1) {
                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
            }
@@ -139,7 +133,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                if (excur != ex) continue;
                const float * x = data + row * src1->ne[0];
                for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                    e.values[e_start + j] += x[j]*x[j];
+                    e.values[j] += x[j]*x[j];
                }
            }
            if (e.ncall > m_last_call) {
@@ -427,7 +421,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return false;
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -61,7 +61,7 @@ class SchemaConverter:

    def _format_literal(self, literal):
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
-            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
        )
        return f'"{escaped}"'

@@ -308,7 +308,8 @@ class SchemaConverter:
        return ref_name

    def _generate_constant_rule(self, value):
-        return self._format_literal(json.dumps(value))
+        assert isinstance(value, str), f'Only string constants are supported, got {value}'
+        return self._format_literal(value)

    def visit(self, schema, name):
        schema_type = schema.get('type')
@@ -427,7 +428,7 @@ class SchemaConverter:
            prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
            prop_kv_rule_names[prop_name] = self._add_rule(
                f'{name}{"-" if name else ""}{prop_name}-kv',
-                fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
+                fr'{self._format_literal(prop_name)} space ":" space {prop_rule_name}'
            )
        required_props = [k for k in sorted_props if k in required]
        optional_props = [k for k in sorted_props if k not in required]
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -113,7 +113,7 @@ static std::string get_cpu_info() {

 static std::string get_gpu_info() {
    std::string id;
-#ifdef GGML_USE_CUDA
+#ifdef GGML_USE_CUBLAS
    int count = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
@@ -808,7 +808,7 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cuda();
+const bool        test::cuda         = !!ggml_cpu_has_cublas();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com

 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.

-Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobileVLM as an example, the different conversion step will be shown.

 ## Usage
 Build with cmake or run `make llava-cli` to build it.
@@ -36,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@@ -78,7 +78,7 @@ cd examples/llava/android/build_64
 ### run on Android
 refer to `android/adb_run.sh`, modify resources' `name` and `path`

-## Some result on Android with `Snapdragon 888` chip
+## some result on Android with `Snapdragon 888` chip
 ### case 1
 **input**
 ```sh
@@ -109,6 +109,7 @@ llama_print_timings:       total time =   34731.93 ms
    --image /data/local/tmp/cat.jpeg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 ```
+
 **output**
 ```sh
 encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
@@ -120,82 +121,12 @@ llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 m
 llama_print_timings:       total time =   34570.79 ms
 ```

-
-## Some result on Android with `Snapdragon 778G` chip
-### MobileVLM-1.7B case
-#### llava-cli release-b2005
-**input**
-```sh
-/data/local/tmp/llava-cli \
-    -m /data/local/tmp/ggml-model-q4_k.gguf \
-    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-    -t 4 \
-    --image /data/local/tmp/many_llamas.jpeg \
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:"
-```
-**output**
-```sh
-encode_image_with_clip: image encoded in 18728.52 ms by CLIP (  130.06 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that? ASSISTANT:
-
- A group of llamas are standing in a green pasture.
-
-llama_print_timings:        load time =   20357.33 ms
-llama_print_timings:      sample time =       2.96 ms /    14 runs   (    0.21 ms per token,  4734.53 tokens per second)
-llama_print_timings: prompt eval time =    8119.49 ms /   191 tokens (   42.51 ms per token,    23.52 tokens per second)
-llama_print_timings:        eval time =    1005.75 ms /    14 runs   (   71.84 ms per token,    13.92 tokens per second)
-llama_print_timings:       total time =   28038.34 ms /   205 tokens
-```
-#### llava-cli latest-version
-**input**
-
-Just the same as above.
-
-**output**(seems to be much slower)
-```sh
-encode_image_with_clip: image embedding created: 144 tokens
-
-encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that? ASSISTANT:
-
- It is a group of sheep standing together in a grass field.
-
-llama_print_timings:        load time =  818120.91 ms
-llama_print_timings:      sample time =       3.44 ms /    14 runs   (    0.25 ms per token,  4067.40 tokens per second)
-llama_print_timings: prompt eval time =  529274.69 ms /   191 tokens ( 2771.07 ms per token,     0.36 tokens per second)
-llama_print_timings:        eval time =   43894.02 ms /    13 runs   ( 3376.46 ms per token,     0.30 tokens per second)
-llama_print_timings:       total time =  865441.76 ms /   204 tokens
-```
-### MobileVLM_V2-1.7B case
-#### llava-cli release-2005b
-**input**
-
-Just the same as above.
-
-**output**
-```sh
-encode_image_with_clip: image encoded in 20609.61 ms by CLIP (  143.12 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that? ASSISTANT:
-
- This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting.
-
-The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama
-
-llama_print_timings:        load time =   22406.77 ms
-llama_print_timings:      sample time =      49.26 ms /   186 runs   (    0.26 ms per token,  3776.27 tokens per second)
-llama_print_timings: prompt eval time =    9044.54 ms /   191 tokens (   47.35 ms per token,    21.12 tokens per second)
-llama_print_timings:        eval time =   14497.49 ms /   186 runs   (   77.94 ms per token,    12.83 tokens per second)
-llama_print_timings:       total time =   44411.01 ms /   377 tokens
-```
-
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```
+
 ### run on Orin
 ### case 1
 **input**
@@ -244,121 +175,8 @@ llama_print_timings:        eval time =     166.65 ms /    11 runs   (   15.15 m
 llama_print_timings:       total time =    1365.47 ms /   243 tokens
 ```

-## Running on Intel(R) Core(TM) i7-10750H
-### Operating system
-Ubuntu22.04
-### compile
-```sh
-make -j32
-```
-### MobileVLM-1.7B case
-**input**
-```sh
-m /path/to/ggml-model-q4_k.gguf \
-    --mmproj /path/to/mmproj-model-f16.gguf \
-    --image /path/to/many_llamas.jpeg
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
-```
-**output**
-```sh
-encode_image_with_clip: image embedding created: 144 tokens
-
-encode_image_with_clip: image encoded in  2730.94 ms by CLIP (   18.96 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that?ASSISTANT:
-
- A group of llamas are walking together in a field.
-
-llama_print_timings:        load time =    5506.60 ms
-llama_print_timings:      sample time =       0.44 ms /    13 runs   (    0.03 ms per token, 29545.45 tokens per second)
-llama_print_timings: prompt eval time =    2031.58 ms /   190 tokens (   10.69 ms per token,    93.52 tokens per second)
-llama_print_timings:        eval time =     438.92 ms /    12 runs   (   36.58 ms per token,    27.34 tokens per second)
-llama_print_timings:       total time =    5990.25 ms /   202 tokens
-```
-
-### MobileVLM_V2-1.7B case
-**input**
-
-Just the same as above.
-
-**ouput**
-```sh
-encode_image_with_clip: image embedding created: 144 tokens
-
-encode_image_with_clip: image encoded in  3223.89 ms by CLIP (   22.39 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that?ASSISTANT:
-
- The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order.
-
-The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico  Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment.
-
-The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park.
-
-llama_print_timings:        load time =    6642.61 ms
-llama_print_timings:      sample time =       8.15 ms /   223 runs   (    0.04 ms per token, 27358.61 tokens per second)
-llama_print_timings: prompt eval time =    2475.07 ms /   190 tokens (   13.03 ms per token,    76.77 tokens per second)
-llama_print_timings:        eval time =    8760.60 ms /   222 runs   (   39.46 ms per token,    25.34 tokens per second)
-llama_print_timings:       total time =   15513.95 ms /   412 tokens
-```
-
-## Run on Intel(R) Core(TM) Ultra7 115H
-### operation system
-Windows11
-### comiple
-```sh
-make -j32
-```
-### MobileVLM-1.7B case
-**input**
-```sh
-m /path/to/ggml-model-q4_k.gguf \
-    --mmproj /path/to/tmp/mmproj-model-f16.gguf \
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat's that? ASSISTANT:" \
-```
-**output**
-```sh
-encode_image_with_clip: image encoded in  4902.81 ms by CLIP (   34.05 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that? ASSISTANT:
-
- The image features a group of brown and white llamas standing in a grassy field.
-
-llama_print_timings:        load time =    7441.06 ms
-llama_print_timings:      sample time =       0.72 ms /    19 runs   (    0.04 ms per token, 26279.39 tokens per second)
-llama_print_timings: prompt eval time =    2090.71 ms /   191 tokens (   10.95 ms per token,    91.36 tokens per second)
-llama_print_timings:        eval time =     512.35 ms /    18 runs   (   28.46 ms per token,    35.13 tokens per second)
-llama_print_timings:       total time =    7987.23 ms /   209 tokens
-```
-
-### MobileVLM_V2-1.7B case
-**input**
-
-Just the same as above.
-
-**output**
-```sh
-encode_image_with_clip: image encoded in  4682.44 ms by CLIP (   32.52 ms per image patch)
-system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER:
-user_prompt: \nWhat's that? ASSISTANT:
-
- This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One
- of them, possibly the first in the line, has its back turned, perhaps observing something in the distance.
-
-The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer.
-
-The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in
- front is not visible, indicating that it might not be the main focus of the photo.
-
-The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves.
-
-
-llama_print_timings:        load time =    7015.35 ms
-llama_print_timings:      sample time =      10.61 ms /   256 runs   (    0.04 ms per token, 24119.09 tokens per second)
-llama_print_timings: prompt eval time =    2052.45 ms /   191 tokens (   10.75 ms per token,    93.06 tokens per second)
-llama_print_timings:        eval time =    7259.43 ms /   255 runs   (   28.47 ms per token,    35.13 tokens per second)
-llama_print_timings:       total time =   14371.19 ms /   446 tokens
-```
+## Minor shortcomings
+The `n_patch` of the output in `ldp` is 1/4 of the input. To keep the implementation simple, the `clip_n_patches` function was uniformly modified to return a quarter of the patch count. As a result, when measuring time consumption, the calculated time will be 4 times larger than the real cost.

 ## TODO

@@ -373,5 +191,5 @@ llama_print_timings:       total time =   14371.19 ms /   446 tokens

 ## contributor
 ```sh
-zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77
+zhangjidong05, yangyang260, huyiming03, chenxiaotao03
 ```
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef GGML_USE_CUDA
+#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #endif

@@ -835,10 +835,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
            // weight ne = [3, 3, 2048, 1]
            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
-            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
-            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
            embeddings = peg_0;
        }
@@ -969,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }

-#ifdef GGML_USE_CUDA
+#ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
 #endif
@@ -1756,7 +1755,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {

    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);

-    if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
        n_patches /= 4;
    }

--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -3,21 +3,3 @@ add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
-set(TARGET lookup-create)
-add_executable(${TARGET} lookup-create.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
-set(TARGET lookup-merge)
-add_executable(${TARGET} lookup-merge.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
-set(TARGET lookup-stats)
-add_executable(${TARGET} lookup-stats.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -1,43 +0,0 @@
-#include "ggml.h"
-#include "llama.h"
-#include "common.h"
-#include "ngram-cache.h"
-
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-int main(int argc, char ** argv){
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        return 1;
-    }
-    // init llama.cpp
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
-    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    GGML_ASSERT(model != nullptr);
-
-    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-
-    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-    fprintf(stderr, "%s: tokenization done\n", __func__);
-
-
-    llama_ngram_cache ngram_cache;
-    llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
-    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
-
-    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
-}
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@@ -1,47 +0,0 @@
-#include "ggml.h"
-#include "llama.h"
-#include "common.h"
-#include "ngram-cache.h"
-
-#include <cstdint>
-#include <cstdio>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-static void print_usage() {
-    fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
-    fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
-}
-
-int main(int argc, char ** argv){
-    if (argc < 3) {
-        print_usage();
-        exit(1);
-    }
-
-    std::vector<std::string> args;
-    args.resize(argc-1);
-    for (int i = 0; i < argc-1; ++i) {
-        args[i] = argv[i+1];
-        if (args[i] == "-h" || args[i] == "--help") {
-            print_usage();
-            exit(0);
-        }
-    }
-
-    fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-    llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
-
-    for (size_t i = 1; i < args.size()-1; ++i) {
-        fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-        llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
-
-        llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
-    }
-
-    fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
-    llama_ngram_cache_save(ngram_cache_merged, args.back());
-}
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -1,163 +0,0 @@
-#include "ggml.h"
-#include "common.h"
-#include "llama.h"
-#include "log.h"
-#include "ngram-cache.h"
-
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <fstream>
-#include <string>
-#include <vector>
-#include <unordered_map>
-
-int main(int argc, char ** argv){
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        return 1;
-    }
-
-    const int n_draft = params.n_draft;
-
-    // init llama.cpp
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
-    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
-    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
-
-    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
-    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-
-    llama_ngram_cache ngram_cache_context;
-    llama_ngram_cache ngram_cache_dynamic;
-    llama_ngram_cache ngram_cache_static;
-    int64_t t_draft_flat_us = 0;
-    int64_t t_draft_us = 0;
-
-    {
-        const int64_t t_start_draft_us = ggml_time_us();
-
-        if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
-                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
-                exit(1);
-            }
-        }
-
-        if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
-        }
-
-        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
-    }
-
-    const int n_input = inp.size();
-    const int n_ctx = params.n_ctx;
-
-    int n_drafted = 0;
-    int n_accept  = 0;
-
-    const int64_t t_start_ms = ggml_time_ms();
-
-    // Iterate over input tokens in chunks of size n_ctx.
-    // Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
-    for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
-        const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
-        std::vector<llama_token> pseudo_output;
-        pseudo_output.push_back(inp_slice[0]);
-
-        while ((int) pseudo_output.size() < n_ctx) {
-            // Simulate drafting and decoding from draft:
-            std::vector<llama_token> draft;
-            draft.push_back(pseudo_output.back());
-
-            {
-                const int64_t t_start_draft_us = ggml_time_us();
-                llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
-                t_draft_us += ggml_time_us() - t_start_draft_us;
-            }
-
-            n_drafted += draft.size() - 1;
-
-            for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
-                const llama_token ground_truth = inp_slice[pseudo_output.size()];
-                const llama_token drafted = draft[j];
-
-                if (ground_truth != drafted) {
-                    break;
-                }
-
-                ++n_accept;
-                pseudo_output.push_back(ground_truth);
-
-                {
-                    const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
-                    t_draft_us += ggml_time_us() - t_start_draft_us;
-                }
-            }
-
-            // After each simulated batch decoding simulate the sampling of a single token:
-            if ((int) pseudo_output.size() < n_ctx) {
-                pseudo_output.push_back(inp_slice[pseudo_output.size()]);
-                {
-                    const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
-                    t_draft_us += ggml_time_us() - t_start_draft_us;
-                }
-            }
-
-            draft.erase(draft.begin());
-
-        }
-        if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
-            const int64_t t_now_ms = ggml_time_ms();
-            const int64_t eta_ms   = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
-            const int64_t eta_min  = eta_ms / (60*1000);
-            const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;
-
-            LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
-        }
-
-        // After each chunk, update the dynamic ngram cache with the context ngram cache:
-        llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-        ngram_cache_context.clear();
-    }
-
-    LOG_TEE("\n");
-
-    LOG_TEE("\n");
-    LOG_TEE("n_draft      = %d\n", n_draft);
-    LOG_TEE("n_predict    = %d\n", n_input - n_input % n_ctx);
-    LOG_TEE("n_drafted    = %d\n", n_drafted);
-    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
-            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept     = %d\n", n_accept);
-    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    fprintf(stderr, "\n\n");
-
-    return 0;
-}
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -1,15 +1,12 @@
+#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "common.h"
-#include "ngram-cache.h"

 #include <cmath>
 #include <cstdint>
 #include <cstdio>
-#include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;
@@ -18,7 +15,11 @@ int main(int argc, char ** argv){
        return 1;
    }

-    // max. number of additional tokens to draft if match is found
+    // max/min n-gram size to search for in the prompt
+    const int ngram_max = 4;
+    const int ngram_min = 1;
+
+    // length of the candidate / draft sequence, if match is found
    const int n_draft = params.n_draft;

    const bool dump_kv_cache = params.dump_kv_cache;
@@ -38,8 +39,6 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
-    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
    const bool add_bos = llama_should_add_bos_token(model);
@@ -48,35 +47,6 @@ int main(int argc, char ** argv){
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

-    llama_ngram_cache ngram_cache_context;
-    llama_ngram_cache ngram_cache_dynamic;
-    llama_ngram_cache ngram_cache_static;
-    int64_t t_draft_flat_us = 0;
-    int64_t t_draft_us = 0;
-
-    {
-        // Fill up context ngram cache with tokens from user input:
-        const int64_t t_start_draft_us = ggml_time_us();
-        llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
-
-        if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
-                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
-                exit(1);
-            }
-        }
-
-        if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
-        }
-
-        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
-    }
-
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

@@ -106,6 +76,8 @@ int main(int argc, char ** argv){
    int n_drafted = 0;
    int n_accept  = 0;

+    int64_t t_draft_us = 0;
+
    int n_past = inp.size();

    bool has_eos = false;
@@ -157,12 +129,6 @@ int main(int argc, char ** argv){
                ++n_past;
                ++i_dft;
                inp.push_back(id);
-                {
-                    // Update context ngram cache with the newly accepted token:
-                    const int64_t t_start_draft_us = ggml_time_us();
-                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
-                    t_draft_us += ggml_time_us() - t_start_draft_us;
-                }

                if (params.use_color) {
                    // color accepted draft token
@@ -183,12 +149,6 @@ int main(int argc, char ** argv){
            draft.clear();
            draft.push_back(id);
            inp.push_back(id);
-            {
-                // Update context ngram cache with the newly accepted token:
-                const int64_t t_start_draft_us = ggml_time_us();
-                llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
-                t_draft_us += ggml_time_us() - t_start_draft_us;
-            }
            break;
        }

@@ -203,19 +163,44 @@ int main(int argc, char ** argv){
        llama_batch_clear(batch_tgt);
        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

-        // Draft already contains a single token sampled from the model:
-        GGML_ASSERT(draft.size() == 1);
-        GGML_ASSERT(draft[0] == inp.back());
+        // draft n_draft tokens through prompt lookup: search `inp` for an earlier occurrence
+        // of its trailing n-gram and propose the tokens that followed that occurrence
+        auto prompt_lookup = [&]() -> void {
+            const int inp_size = inp.size();
+            for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
+                // too few tokens for an earlier occurrence; also keeps the index below in bounds
+                if (inp_size < ngram_size * 2) { continue; }
+                const llama_token * ngram = &inp[inp_size - ngram_size];
+                for (int i = 0; i <= inp_size - (ngram_size * 2); ++i) {
+                    bool match = true;
+                    for (int j = 0; j < ngram_size; ++j) {
+                        if (inp[i + j] != ngram[j]) {
+                            match = false;
+                            break;
+                        }
+                    }
+                    if (match) {
+                        const int startIdx = i + ngram_size;
+                        const int endIdx = startIdx + n_draft;
+                        if (endIdx < inp_size) { // only draft when a full n_draft continuation exists
+                            for (int j = startIdx; j < endIdx; ++j) {
+                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
+                                draft.push_back(inp[j]);
+                                llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
+                                ++n_drafted;
+                            }
+                            return; // first usable match wins
+                        }
+                    }
+                }
+            }
+        };
+
        const int64_t t_start_draft_us = ggml_time_us();

-        llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
-
-        for (size_t i = 1; i < draft.size(); ++i) {
-            llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
-        }
+        prompt_lookup();

        t_draft_us += ggml_time_us() - t_start_draft_us;
-        n_drafted += draft.size() - 1;

        llama_decode(ctx, batch_tgt);
        ++n_past;
@@ -225,24 +210,19 @@ int main(int argc, char ** argv){

    auto t_dec_end = ggml_time_us();

-    // Update dynamic ngram cache with context ngram cache and save it to disk:
-    llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-    llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
-
    LOG_TEE("\n\n");

    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

    LOG_TEE("\n");
-    LOG_TEE("n_draft      = %d\n", n_draft);
-    LOG_TEE("n_predict    = %d\n", n_predict);
-    LOG_TEE("n_drafted    = %d\n", n_drafted);
-    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_TEE("n_draft   = %d\n", n_draft);
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_drafted = %d\n", n_drafted);
+    LOG_TEE("t_draft   = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept     = %d\n", n_accept);
-    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("n_accept  = %d\n", n_accept);
+    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ntarget:\n");
    llama_print_timings(ctx);
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b

 ### Considerations

-When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. cuBLAS, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

 ### Build llama.cpp and install to C:\LlamaCPP directory

--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -296,9 +296,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Batch Size

-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
-
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
+-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

 ### Prompt Caching

@@ -318,8 +316,8 @@ These options provide extra functionality and customization when running the LLa

 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
-   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
+-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -132,6 +132,7 @@ int main(int argc, char ** argv) {
    llama_context * ctx = NULL;

    // load the target model
+    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // load the prompts from an external file if there are any
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -380,7 +380,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            const int batch_size  = std::min(end - batch_start, n_batch);

            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                //fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
@@ -553,8 +552,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

-            int n_outputs = 0;
-
            batch.n_tokens = 0;
            for (int seq = 0; seq < n_seq_batch; seq++) {
                int seq_start = batch_start + seq*n_ctx;
@@ -569,13 +566,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

                for (int k = 0; k < batch_size; ++k) {
                    const int idx = seq*n_ctx + k;
-                    batch.token   [idx]    = tokens[seq_start + k];
-                    batch.pos     [idx]    = j*n_batch + k;
-                    batch.n_seq_id[idx]    = 1;
-                    batch.seq_id  [idx][0] = seq;
-                    batch.logits  [idx]    = batch.pos[idx] >= first ? 1 : 0;
-
-                    n_outputs += batch.logits[idx] != 0;
+                    batch.token[idx] = tokens[seq_start + k];
+                    batch.pos[idx] = j*n_batch + k;
+                    batch.n_seq_id[idx] = 1;
+                    batch.seq_id[idx][0] = seq;
+                    batch.logits[idx] = batch.pos[idx] >= first ? 1 : 0;
                }
                batch.n_tokens += batch_size;

@@ -588,9 +583,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
                return {tokens, -1, logit_history, prob_history};
            }

-            if (num_batches > 1 && n_outputs > 0) {
+            if (num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

@@ -609,15 +604,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        }

        for (int seq = 0; seq < n_seq_batch; seq++) {
-            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
-
+            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
            llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
            if (!params.logits_file.empty()) {
-                process_logits(logits_stream, n_vocab, all_logits,
+                process_logits(logits_stream, n_vocab, all_logits + first*n_vocab,
                        tokens_data, n_ctx - 1 - first,
                        workers, log_probs, nll, nll2);
            } else {
-                process_logits(n_vocab, all_logits,
+                process_logits(n_vocab, all_logits + first*n_vocab,
                        tokens_data, n_ctx - 1 - first,
                        workers, nll, nll2,
                        logit_history.data() + start + seq*n_ctx + first,
@@ -658,7 +652,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }

 static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
-    int prev_outputs = 0;
    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

@@ -679,14 +672,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
            return false;
        }

-        int n_outputs = 0;
-        for (int i = 0; i < n_tokens; ++i) {
-            n_outputs += batch_view.logits[i] != 0;
-        }
-
-        memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
-
-        prev_outputs += n_outputs;
+        memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
    }

    return true;
@@ -793,7 +779,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        size_t ending_logprob_count[4];
        double ending_logprob[4];

-        size_t i_logits;        // starting index of logits in the llama_batch
+        size_t i_batch;         // starting index in the llama_batch
        size_t common_prefix;   // max number of initial tokens that are the same in all sentences
        size_t required_tokens; // needed number of tokens to evaluate all 4 endings
        std::vector<llama_token> seq_tokens[4];
@@ -858,10 +844,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    const int max_tasks_per_batch = 32;
    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

-    llama_batch batch = llama_batch_init(n_ctx, 0, 4);
+    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
-    // TODO: this could be made smaller; it's currently the worst-case size
    std::vector<float> batch_logits(n_vocab*n_ctx);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
@@ -872,17 +857,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        int n_cur = 0;

        size_t i1 = i0;
-        size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
+        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently

        llama_batch_clear(batch);

        // batch as much tasks as possible into the available context
-        // each task has 4 unique sequence ids - one for each ending
+        // each task has 4 unique sequence ids - one for each ending
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
            auto & hs_cur = hs_data[i1];
-            int n_logits = 0;

            const int s0 = 4*(i1 - i0);
            if (s0 + 4 > max_seq) {
@@ -890,23 +874,18 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            }

            for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
-                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
+                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
            }
            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
-            n_logits += 1;

            for (int s = 0; s < 4; ++s) {
-                const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
-                // TODO: don't evaluate the last token of each sequence
-                for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
-                    const bool needs_logits = i < seq_tokens_size - 1;
-                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
-                    n_logits += needs_logits;
+                for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
+                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
                }
            }

-            hs_cur.i_logits = i_logits;
-            i_logits += n_logits;
+            hs_cur.i_batch = i_batch;
+            i_batch += hs_cur.required_tokens;

            n_cur += hs_data[i1].required_tokens;
            if (++i1 == hs_task_count) {
@@ -932,11 +911,12 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];
-            size_t li = 1; // skip the last logit of the common prefix (computed separately below)
+            size_t li = hs_cur.common_prefix;
            for (int s = 0; s < 4; ++s) {
                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]);
+                    eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
                }
+                ++li;
            }
        }
        // Then we do the actual calculation
@@ -948,8 +928,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];

-            // get the logits of the last token of the common prefix
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));

            const auto first_probs = softmax(tok_logits);

@@ -999,7 +978,7 @@ struct winogrande_entry {
    std::array<std::string, 2> choices;
    int answer;

-    size_t i_logits;
+    size_t i_batch;
    size_t common_prefix;
    size_t required_tokens;
    size_t n_base1; // number of tokens for context + choice 1
@@ -1125,7 +1104,6 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            task.common_prefix++;
        }

-        // TODO: the last token of each of the sequences don't need to be evaluated
        task.required_tokens = task.common_prefix +
            task.seq_tokens[0].size() - task.common_prefix +
            task.seq_tokens[1].size() - task.common_prefix;
@@ -1143,10 +1121,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    const int max_tasks_per_batch = 128;
    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

-    llama_batch batch = llama_batch_init(n_ctx, 0, 2);
+    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

    std::vector<float> tok_logits(n_vocab);
-    // TODO: this could be made smaller; it's currently the worst-case size
    std::vector<float> batch_logits(n_vocab*n_ctx);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
@@ -1160,33 +1137,29 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        int n_cur = 0;

        size_t i1 = i0;
-        size_t i_logits = 0;
+        size_t i_batch = 0;

        llama_batch_clear(batch);

        while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
-            int n_logits = 0;
            const int s0 = 2*(i1 - i0);
            if (s0 + 2 > max_seq) {
                break;
            }

            for (size_t i = 0; i < data[i1].common_prefix; ++i) {
-                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
+                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
            }
            batch.logits[batch.n_tokens - 1] = true;
-            n_logits += 1;

            for (int s = 0; s < 2; ++s) {
-                // TODO: end before the last token, no need to predict past the end of the sequences
                for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
                    llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
-                    n_logits += 1;
                }
            }

-            data[i1].i_logits = i_logits;
-            i_logits += n_logits;
+            data[i1].i_batch = i_batch;
+            i_batch += data[i1].required_tokens;

            n_cur += data[i1].required_tokens;
            if (++i1 == data.size()) {
@@ -1217,16 +1190,15 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
-            size_t li = n_base1 - task.common_prefix;
+            size_t li = n_base1 - 1;
            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
-                eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]);
+                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
            }
            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
-            // FIXME: this uses the wrong first logits when not skipping the choice word
-            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix;
+            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
-                eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]);
+                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
            }
        }
        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
@@ -1315,7 +1287,7 @@ struct multiple_choice_task {
    }

    // For evaluation
-    size_t i_logits;        // starting index of logits in the llama_batch
+    size_t i_batch;         // starting index in the llama_batch
    size_t common_prefix;   // max number of initial tokens that are the same in all sentences
    size_t required_tokens; // needed number of tokens to evaluate all answers
    std::vector<std::vector<llama_token>> seq_tokens;
@@ -1394,7 +1366,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
-        printf("%s: failed to read task positions from prompt\n", __func__);
+        printf("%s: failed to read task positions from prompt\n", __func__);
        return;
    }

@@ -1475,7 +1447,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            return;
        }
    } else {
-        int n_dot = std::max((int) n_task/100, 1);
+        int n_dot = n_task/100;
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
@@ -1519,18 +1491,17 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        int n_cur = 0;

        size_t i1 = i0;
-        size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
+        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently

        llama_batch_clear(batch);

        // batch as much tasks as possible into the available context
-        // each task has 4 unique sequence ids - one for each ending
+        // each task has 4 unique sequence ids - one for each ending
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        int s0 = 0;
        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
            auto& cur_task = tasks[i1];
-            int n_logits = 0;

            int num_answers = cur_task.seq_tokens.size();
            if (s0 + num_answers > max_seq) {
@@ -1547,22 +1518,17 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
            }
            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
-            n_logits += 1;

            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
-                // TODO: don't evaluate the last token of each sequence
-                for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
-                    const bool needs_logits = i < seq_tokens_size - 1;
-                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
-                    n_logits += needs_logits;
+                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
+                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
                }
            }

            s0 += num_answers;

-            cur_task.i_logits = i_logits;
-            i_logits += n_logits;
+            cur_task.i_batch = i_batch;
+            i_batch += cur_task.required_tokens;

            n_cur += cur_task.required_tokens;
            if (++i1 == tasks.size()) {
@@ -1588,11 +1554,12 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto& cur_task = tasks[i];
-            size_t li = 1; // skip the last logit of the common prefix (computed separately below)
+            size_t li = cur_task.common_prefix;
            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]);
+                    eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
                }
+                ++li;
            }
        }
        // Then we do the actual calculation
@@ -1611,8 +1578,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            //}
            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);

-            // get the logits of the last token of the common prefix
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));

            const auto first_probs = softmax(tok_logits);

@@ -1764,7 +1730,6 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

-            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,7 +26,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
    { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
-    { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",            },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
@@ -88,17 +87,13 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
-    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
-    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
    printf("\nAllowed quantization types:\n");
    for (auto & it : QUANT_OPTIONS) {
@@ -112,60 +107,56 @@ static void usage(const char * executable) {
    exit(1);
 }

-static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
-        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
-        exit(1);
+        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        return;
    }
    int n_entries;
-    in.read((char *)&n_entries, sizeof(n_entries));
+    in.read((char*)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        exit(1);
+        return;
    }
    for (int i = 0; i < n_entries; ++i) {
        int len; in.read((char *)&len, sizeof(len));
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
-            exit(1);
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            return;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
-        auto & e = imatrix_data[name];
+        auto& e = imatrix_data[std::move(name)];
        int ncall;
-        in.read((char *)&ncall, sizeof(ncall));
+        in.read((char*)&ncall, sizeof(ncall));
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n", __func__, i);
+            printf("%s: failed reading number of values for entry %d\n",__func__,i);
            imatrix_data = {};
-            exit(1);
+            return;
        }
        e.resize(nval);
-        in.read((char *)e.data(), nval*sizeof(float));
+        in.read((char*)e.data(), nval*sizeof(float));
        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n", __func__, i);
+            printf("%s: failed reading data for entry %d\n",__func__,i);
            imatrix_data = {};
-            exit(1);
+            return;
        }
        if (ncall > 0) {
            for (auto& v : e) v /= ncall;
        }
-
-        if (getenv("LLAMA_TRACE")) {
-            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
-        }
    }
-    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
 }

-static void prepare_imatrix(const std::string & imatrix_file,
-        const std::vector<std::string> & included_weights,
-        const std::vector<std::string> & excluded_weights,
-        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static void prepare_imatrix(const std::string& imatrix_file,
+        const std::vector<std::string>& included_weights,
+        const std::vector<std::string>& excluded_weights,
+        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
    if (!imatrix_file.empty()) {
        load_imatrix(imatrix_file, imatrix_data);
    }
@@ -198,55 +189,6 @@ static void prepare_imatrix(const std::string & imatrix_file,
    }
 }

-static ggml_type parse_ggml_type(const char * arg) {
-    ggml_type result = GGML_TYPE_COUNT;
-    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
-        auto type = ggml_type(j);
-        const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
-            result = type; break;
-        }
-    }
-    return result;
-}
-
-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char* sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.int_value = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.float_value = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.bool_value = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.bool_value = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -257,27 +199,10 @@ int main(int argc, char ** argv) {
    int arg_idx = 1;
    std::string imatrix_file;
    std::vector<std::string> included_weights, excluded_weights;
-    std::vector<llama_model_kv_override> kv_overrides;

    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
            params.quantize_output_tensor = false;
-        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
-            if (arg_idx < argc-1) {
-                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
-            if (arg_idx < argc-1) {
-                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
-            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
-                usage(argv[0]);
-            }
        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
            params.allow_requantize = true;
        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -318,11 +243,6 @@ int main(int argc, char ** argv) {
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
    }
-    if (!kv_overrides.empty()) {
-        kv_overrides.emplace_back();
-        kv_overrides.back().key[0] = 0;
-        params.kv_overrides = &kv_overrides;
-    }

    llama_backend_init();

@@ -344,7 +264,8 @@ int main(int argc, char ** argv) {
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
-    } else {
+    }
+    else {
        fname_out = argv[arg_idx];
        arg_idx++;

@@ -375,12 +296,10 @@ int main(int argc, char ** argv) {

    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
-        fprintf(stderr, "\n==========================================================================================================\n");
-        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
-        fprintf(stderr, "==========================================================================================================\n\n\n");
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
+        fprintf(stderr, "\n===============================================================================================\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "===============================================================================================\n\n\n");
        return 1;
    }

--- a/examples/retrieval/CMakeLists.txt
+++ b/examples/retrieval/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET retrieval)
-add_executable(${TARGET} retrieval.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/retrieval/README.md
+++ b/examples/retrieval/README.md
@@ -1,69 +0,0 @@
-# llama.cpp/examples/retrieval
-
-Demonstration of simple retrieval technique based on cosine similarity
-
-More info:
-https://github.com/ggerganov/llama.cpp/pull/6193
-
-### How to use
-
-`retieval.cpp` has parameters of its own:
- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
- `--chunk-size`: minimum size of each text chunk to be embedded
- `--chunk-separator`: STRING to divide chunks by. newline by default
-
-`retrieval` example can be tested as follows:
-
-```bash
-make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
-```
-
-This chunks and embeds all given files and starts a loop requesting query inputs:
-
-```
-Enter query:
-```
-
-On each query input, top k chunks are shown along with file name, chunk position within file and original text:
-
-```
-Enter query: describe the mit license
-batch_decode: n_tokens = 6, n_seq = 1
-Top 3 similar chunks:
-filename: README.md
-filepos: 119
-similarity: 0.762334
-textdata:
-png)
-
-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-
-[Roadmap](https://github.
--------------------
-filename: License
-filepos: 0
-similarity: 0.725146
-textdata:
-MIT License
-
-Copyright (c) 2023 Georgi Gerganov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
--------------------
-filename: README.md
-filepos: 9178
-similarity: 0.621722
-textdata:
-com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.
--------------------
-```
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -1,350 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <algorithm>
-#include <fstream>
-
-struct retrieval_params {
-    std::vector<std::string> context_files; // context files to embed
-    int32_t chunk_size            = 64;     // chunk size for context embedding
-    std::string chunk_separator   = "\n";   // chunk separator for context embedding
-};
-
-static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
-    gpt_print_usage(argc, argv, gpt_params);
-    printf("retrieval options:\n");
-    printf("  --context-file FNAME  file containing context to embed.\n");
-    printf("                        specify multiple files by providing --context-file option multiple times.\n");
-    printf("  --chunk-size N        minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
-    printf("  --chunk-separator STRING\n");
-    printf("                        string to separate chunks (default: \"\\n\")\n");
-    printf("\n");
-}
-
-static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
-    int i = 1;
-    std::string arg;
-    while (i < argc) {
-        arg = argv[i];
-        bool invalid_gpt_param = false;
-        if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
-            if (invalid_gpt_param) {
-                fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
-                exit(1);
-            }
-            // option was parsed by gpt_params_find_arg
-        } else if (arg == "--context-file") {
-            if (++i >= argc) {
-                fprintf(stderr, "error: missing argument for --context-file\n");
-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
-                exit(1);
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
-                exit(1);
-            }
-            // store the external file name in params
-            retrieval_params.context_files.push_back(argv[i]);
-        } else if (arg == "--chunk-size") {
-            if (++i >= argc) {
-                fprintf(stderr, "error: missing argument for --chunk-size\n");
-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
-                exit(1);
-            }
-            retrieval_params.chunk_size = std::stoi(argv[i]);
-        } else if (arg == "--chunk-separator") {
-            if (++i >= argc) {
-                fprintf(stderr, "error: missing argument for --chunk-separator\n");
-                retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
-                exit(1);
-            }
-            retrieval_params.chunk_separator = argv[i];
-        } else {
-            // unknown argument
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
-            exit(1);
-        }
-        i++;
-    }
-}
-
-struct chunk {
-    // filename
-    std::string filename;
-    // original file position
-    size_t filepos;
-    // original text data
-    std::string textdata = "";
-    // tokenized text data
-    std::vector<llama_token> tokens;
-    // embedding
-    std::vector<float> embedding;
-};
-
-// chunk file data to chunks of size >= chunk_size
-// chunk_separator is the separator between chunks
-static std::vector<chunk> chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) {
-    std::vector<chunk> chunks;
-    std::ifstream f(filename.c_str());
-
-    if (!f.is_open()) {
-        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
-        return chunks;
-    }
-
-    chunk current_chunk;
-    char buffer[1024];
-    int64_t filepos = 0;
-    std::string current = "";
-    while (f.read(buffer, 1024)) {
-        current += std::string(buffer, f.gcount());
-        size_t pos;
-        while ((pos = current.find(chunk_separator)) != std::string::npos) {
-            current_chunk.textdata += current.substr(0, pos + chunk_separator.size());
-            if ((int) current_chunk.textdata.size() > chunk_size) {
-                // save chunk
-                current_chunk.filepos = filepos;
-                current_chunk.filename = filename;
-                chunks.push_back(current_chunk);
-                // update filepos
-                filepos += (int) current_chunk.textdata.size();
-                // reset current_chunk
-                current_chunk = chunk();
-            }
-            current = current.substr(pos + chunk_separator.size());
-        }
-
-    }
-    // add leftover data to last chunk
-    if (current_chunk.textdata.size() > 0) {
-        if (chunks.empty()) {
-            current_chunk.filepos = filepos;
-            current_chunk.filename = filename;
-            chunks.push_back(current_chunk);
-        } else {
-            chunks.back().textdata += current_chunk.textdata;
-        }
-    }
-    f.close();
-    return chunks;
-}
-
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
-    }
-}
-
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
-    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
-
-    // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
-    }
-
-    for (int i = 0; i < batch.n_tokens; i++) {
-        if (!batch.logits[i]) {
-            continue;
-        }
-
-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
-
-        float * out = output + batch.seq_id[i][0] * n_embd;
-        llama_embd_normalize(embd, out, n_embd);
-    }
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-    retrieval_params retrieval_params;
-
-    retrieval_params_parse(argc, argv, params, retrieval_params);
-
-    // For BERT models, batch size must be equal to ubatch size
-    params.n_ubatch = params.n_batch;
-
-    if (retrieval_params.chunk_size <= 0) {
-        fprintf(stderr, "chunk_size must be positive\n");
-        return 1;
-    }
-    if (retrieval_params.context_files.empty()) {
-        fprintf(stderr, "context_files must be specified\n");
-        return 1;
-    }
-    params.embedding = true;
-
-    print_build_info();
-
-    printf("processing files:\n");
-    for (auto & context_file : retrieval_params.context_files) {
-        printf("%s\n", context_file.c_str());
-    }
-
-    std::vector<chunk> chunks;
-    for (auto & context_file : retrieval_params.context_files) {
-        std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
-        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
-    }
-    printf("Number of chunks: %ld\n", chunks.size());
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    const int n_ctx_train = llama_n_ctx_train(model);
-    const int n_ctx = llama_n_ctx(ctx);
-
-    if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, n_ctx);
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
-    }
-
-    // max batch size
-    const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch >= params.n_ctx);
-
-    // tokenize the prompts and trim
-    for (auto & chunk : chunks) {
-        auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
-        if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
-                    __func__, (long long int) inp.size(), (long long int) n_batch);
-            return 1;
-        }
-        // add eos if not present
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
-        }
-        chunk.tokens = inp;
-    }
-
-    // tokenization stats
-    if (params.verbose_prompt) {
-        for (int i = 0; i < (int) chunks.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
-            for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
-            }
-            fprintf(stderr, "\n\n");
-        }
-    }
-
-    // initialize batch
-    const int n_chunks = chunks.size();
-    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
-
-    // allocate output
-    const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_chunks * n_embd, 0);
-    float * emb = embeddings.data();
-
-    // break into batches
-    int p = 0; // number of prompts processed already
-    int s = 0; // number of prompts in current batch
-    for (int k = 0; k < n_chunks; k++) {
-        // clamp to n_batch tokens
-        auto & inp = chunks[k].tokens;
-
-        const uint64_t n_toks = inp.size();
-
-        // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
-            llama_batch_clear(batch);
-            p += s;
-            s = 0;
-        }
-
-        // add to batch
-        batch_add_seq(batch, inp, s);
-        s += 1;
-    }
-
-    // final batch
-    float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
-
-    // save embeddings to chunks
-    for (int i = 0; i < n_chunks; i++) {
-        chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
-        // clear tokens as they are no longer needed
-        chunks[i].tokens.clear();
-    }
-
-    // start loop, receive query and return top k similar chunks based on cosine similarity
-    std::string query;
-    while (true) {
-        printf("Enter query: ");
-        std::getline(std::cin, query);
-        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
-
-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
-        batch_add_seq(query_batch, query_tokens, 0);
-
-        std::vector<float> query_emb(n_embd, 0);
-        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
-
-        llama_batch_clear(query_batch);
-
-        // compute cosine similarities
-        {
-            std::vector<std::pair<int, float>> similarities;
-            for (int i = 0; i < n_chunks; i++) {
-                float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
-                similarities.push_back(std::make_pair(i, sim));
-            }
-
-            // sort similarities
-            std::sort(similarities.begin(), similarities.end(), [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
-                return a.second > b.second;
-            });
-
-            printf("Top %d similar chunks:\n", params.sparams.top_k);
-            for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
-                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
-                printf("similarity: %f\n", similarities[i].second);
-                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
-                printf("--------------------\n");
-            }
-        }
-    }
-
-    // clean up
-    llama_print_timings(ctx);
-    llama_free(ctx);
-    llama_free_model(model);
-    llama_backend_free();
-}
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -16,50 +16,49 @@ The project is under active development, and we are [looking for feedback and co

 **Command line options:**

- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
+- `--threads N`, `-t N`: Set the number of threads to use during generation.
+- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`.
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
+- `--numa STRATEGY`: Attempt one of the below optimization strategies  that help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
 - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
-page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
+- `--numa numactl`: Use the CPU map provided by numactl
+if run without this previously, it is recommended to drop the system page cache before using this
+see https://github.com/ggerganov/llama.cpp/issues/1437

- `--numa`: Attempt optimizations that may help on some NUMA systems.
+- `--numa`: Attempt optimizations that help on some NUMA systems.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
- `--port`: Set the port to listen. Default: `8080`
- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embedding`: Enable embedding extraction. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching).  Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
+- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
+- `--port`: Set the port to listen. Default: `8080`.
+- `--path`: path from which to serve static files (default: disabled)
+- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
+- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
+- `--embedding`: Enable embedding extraction. Default: disabled.
+- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
+- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend.  Used together with group attention factor `--grp-attn-n`. Default: `512`
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
+- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
+- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
+- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name.  Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
+- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
+- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- `--log-disable`: Output logs to stdout only, default: enabled.
+- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)

 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -67,7 +66,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/

 ## Build

-`server` is built alongside everything else from the root of the project
+server is built alongside everything else from the root of the project

 - Using `make`:

@@ -83,7 +82,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/

 ## Build with SSL

-`server` can also be built with SSL support using OpenSSL 3
+server can also be built with SSL support using OpenSSL 3

 - Using `make`:

@@ -133,7 +132,7 @@ docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/

 ## Testing with CURL

-Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS.
+Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.

 ```sh
 curl --request POST \
@@ -157,7 +156,7 @@ mkdir llama-client
 cd llama-client
 ```

-Create a index.js file and put this inside:
+Create an index.js file and put this inside:

 ```javascript
 const prompt = `Building a website can be done in 10 simple steps:`;
@@ -188,8 +187,8 @@ node index.js
  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
  - 500 -> `{"status": "error"}` if the model failed to load.
  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
-  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
-  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
+  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot is currently available.
+  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot is currently available.

  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.

@@ -203,75 +202,75 @@ node index.js
      - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
      - The system prompt is empty

-    `temperature`: Adjust the randomness of the generated text. Default: `0.8`
+    `temperature`: Adjust the randomness of the generated text (default: 0.8).

-    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled.
+    `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled).

-    `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0`
+    `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0).

-    `top_k`: Limit the next token selection to the K most probable tokens.  Default: `40`
+    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).

-    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
+    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).

-    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
+    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).

-    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
+    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).

    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
-    By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
+    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

    `stop`: Specify a JSON array of stopping strings.
-    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
+    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

-    `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled.
+    `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

-    `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
+    `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).

-    `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1`
+    `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).

-    `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
+    `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).

-    `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true`
+    `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).

-    `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled.
+    `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled).

-    `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
+    `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled).

-    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
+    `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`).

-    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
+    `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).

-    `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
+    `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0).

-    `mirostat_eta`: Set the Mirostat learning rate, parameter eta.  Default: `0.1`
+    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

-    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar
+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)

-    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.
+    `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

-    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
+    `ignore_eos`: Ignore end of stream token and continue generating (default: false).

-    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
+    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []).

-    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
+    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

-    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
+    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-    `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot.  Default: `-1`
+    `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)

-    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch.  Default: `false`
+    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

-    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
+    `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values)

 ### Result JSON

- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
+- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.

 - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:

@@ -285,7 +284,7 @@ node index.js
    },
    {
      "prob": float,
-      "tok_str": "<second most likely token>"
+      "tok_str": "<second most likely token>"
    },
    ...
  ]
@@ -355,14 +354,14 @@ Notice that each `probs` is an array of length `n_probs`.

 - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
 - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
+- `default_generation_settings` - the default generation settings for the `/completion` endpoint; it has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)

- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint.

    *Options:*

-    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

    *Examples:*

@@ -512,16 +511,16 @@ Available metrics:
 - `llamacpp:tokens_predicted_total`: Number of generation tokens processed.
 - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s.
 - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s.
- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage.
+- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage.
 - `llamacpp:kv_cache_tokens`: KV-cache tokens.
- `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred.
+- `llamacpp:requests_processing`: Number of requests currently being processed.
+- `llamacpp:requests_deferred`: Number of requests currently deferred.

 ## More examples

 ### Change system prompt on runtime

-To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.

 `prompt`: Specify a context that you want all connecting clients to respect.

@@ -560,11 +559,11 @@ bash chat.sh

 ### OAI-like API

-The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
+The HTTP server supports an OAI-like API: https://github.com/openai/openai-openapi

 ### API errors

-`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+The server returns errors in the same format as OAI: https://github.com/openai/openai-openapi

 Example of an error:

--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,15 +2,13 @@

 Benchmark is using [k6](https://k6.io/).

-##### Install k6 and sse extension
+##### Install k6

-SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
+Follow the installation instructions at: https://k6.io/docs/get-started/installation/

-Example:
+Example for Ubuntu:
 ```shell
-go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
--with github.com/phymbert/xk6-sse
+snap install k6
 ```

 #### Download a dataset
@@ -48,7 +46,7 @@ server --host localhost --port 8080 \

 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
 ```shell
-./k6 run script.js --duration 10m --iterations 500 --vus 8
+k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
@@ -88,33 +86,3 @@ K6 metrics might be compared against [server metrics](../README.md), with:
 ```shell
 curl http://localhost:8080/metrics
 ```
-
-### Using the CI python script
-The `bench.py` script does several steps:
- start the server
- define good variable for k6
- run k6 script
- extract metrics from prometheus
-
-It aims to be used in the CI, but you can run it manually:
-
-```shell
-LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
-              --runner-label local \
-              --name local \
-              --branch `git rev-parse --abbrev-ref HEAD` \
-              --commit `git rev-parse HEAD` \
-              --scenario script.js \
-              --duration 5m \
-              --hf-repo ggml-org/models	 \
-              --hf-file phi-2/ggml-model-q4_0.gguf \
-              --model-path-prefix models \
-              --parallel 4 \
-              -ngl 33 \
-              --batch-size 2048 \
-              --ubatch-size	256 \
-              --ctx-size 4096 \
-              --n-prompts 200 \
-              --max-prompt-tokens 256 \
-              --max-tokens 256
-```
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -1,308 +0,0 @@
-import argparse
-import json
-import os
-import re
-import signal
-import socket
-import subprocess
-import sys
-import threading
-import time
-import traceback
-from contextlib import closing
-from datetime import datetime
-
-import matplotlib
-import matplotlib.dates
-import matplotlib.pyplot as plt
-import requests
-from statistics import mean
-
-
-def main(args_in: list[str] | None = None) -> None:
-    parser = argparse.ArgumentParser(description="Start server benchmark scenario")
-    parser.add_argument("--name", type=str, help="Bench name", required=True)
-    parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
-    parser.add_argument("--branch", type=str, help="Branch name", default="detached")
-    parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
-    parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
-    parser.add_argument("--port", type=int, help="Server listen host", default="8080")
-    parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
-    parser.add_argument("--n-prompts", type=int,
-                        help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
-    parser.add_argument("--max-prompt-tokens", type=int,
-                        help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
-                        required=True)
-    parser.add_argument("--max-tokens", type=int,
-                        help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
-                        required=True)
-    parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
-    parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
-    parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
-    parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
-    parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
-    parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
-    parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
-    parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
-    parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
-
-    args = parser.parse_args(args_in)
-
-    start_time = time.time()
-
-    # Start the server and performance scenario
-    try:
-        server_process = start_server(args)
-    except Exception:
-        print("bench: server start error :")
-        traceback.print_exc(file=sys.stdout)
-        sys.exit(1)
-
-    # start the benchmark
-    try:
-        start_benchmark(args)
-
-        iterations = 0
-        with open("results.github.env", 'w') as github_env:
-            # parse output
-            with open('k6-results.json', 'r') as bench_results:
-                # Load JSON data from file
-                data = json.load(bench_results)
-                for metric_name in data['metrics']:
-                    for metric_metric in data['metrics'][metric_name]:
-                        value = data['metrics'][metric_name][metric_metric]
-                        if isinstance(value, float) or isinstance(value, int):
-                            value = round(value, 2)
-                            data['metrics'][metric_name][metric_metric]=value
-                            github_env.write(
-                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-                iterations = data['root_group']['checks']['success completion']['passes']
-
-    except Exception:
-        print("bench: error :")
-        traceback.print_exc(file=sys.stdout)
-
-    # Stop the server
-    if server_process:
-        try:
-            print(f"bench: shutting down server pid={server_process.pid} ...")
-            if os.name == 'nt':
-                interrupt = signal.CTRL_C_EVENT
-            else:
-                interrupt = signal.SIGINT
-            server_process.send_signal(interrupt)
-            server_process.wait(0.5)
-
-        except subprocess.TimeoutExpired:
-            print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
-            server_process.kill()  # SIGKILL
-            server_process.wait()
-
-        while is_server_listening(args.host, args.port):
-            time.sleep(0.1)
-
-    title = (f"llama.cpp {args.name} on {args.runner_label}\n "
-             f"duration={args.duration} {iterations} iterations")
-    xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
-              f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
-              f"branch={args.branch} commit={args.commit}")
-
-    # Prometheus
-    end_time = time.time()
-    prometheus_metrics = {}
-    if is_server_listening("0.0.0.0", 9090):
-        metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
-                   'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
-
-        for metric in metrics:
-            resp = requests.get(f"http://localhost:9090/api/v1/query_range",
-                                params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
-
-            with open(f"{metric}.json", 'w') as metric_json:
-                metric_json.write(resp.text)
-
-            if resp.status_code != 200:
-                print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
-            else:
-                metric_data = resp.json()
-                values = metric_data['data']['result'][0]['values']
-                timestamps, metric_values = zip(*values)
-                metric_values = [float(value) for value in metric_values]
-                prometheus_metrics[metric] = metric_values
-                timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
-                plt.figure(figsize=(16, 10), dpi=80)
-                plt.plot(timestamps_dt, metric_values, label=metric)
-                plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
-                plt.yticks(fontsize=12, alpha=.7)
-
-                ylabel = f"llamacpp:{metric}"
-                plt.title(title,
-                          fontsize=14, wrap=True)
-                plt.grid(axis='both', alpha=.3)
-                plt.ylabel(ylabel, fontsize=22)
-                plt.xlabel(xlabel, fontsize=14, wrap=True)
-                plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
-                plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
-                plt.gcf().autofmt_xdate()
-
-                # Remove borders
-                plt.gca().spines["top"].set_alpha(0.0)
-                plt.gca().spines["bottom"].set_alpha(0.3)
-                plt.gca().spines["right"].set_alpha(0.0)
-                plt.gca().spines["left"].set_alpha(0.3)
-
-                # Save the plot as a jpg image
-                plt.savefig(f'{metric}.jpg', dpi=60)
-                plt.close()
-
-                # Mermaid format in case images upload failed
-                with (open(f"{metric}.mermaid", 'w') as mermaid_f):
-                    mermaid = (
-                    f"""---
-config:
-    xyChart:
-        titleFontSize: 12
-        width: 900
-        height: 600
-    themeVariables:
-        xyChart:
-            titleColor: "#000000"
---
-xychart-beta
-    title "{title}"
-    y-axis "llamacpp:{metric}"
-    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
-    line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
-                    """)
-                    mermaid_f.write(mermaid)
-
-    # 140 chars max for commit status description
-    bench_results = {
-        "i": iterations,
-        "req": {
-            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
-            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
-        },
-        "pp": {
-            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
-            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
-        },
-        "tg": {
-            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
-            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
-            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
-        },
-    }
-    with open("results.github.env", 'a') as github_env:
-        github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
-        github_env.write(f"BENCH_ITERATIONS={iterations}\n")
-
-        title = title.replace('\n', ' ')
-        xlabel = xlabel.replace('\n', ' ')
-        github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
-        github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
-
-
-def start_benchmark(args):
-    k6_path = './k6'
-    if 'BENCH_K6_BIN_PATH' in os.environ:
-        k6_path = os.environ['BENCH_K6_BIN_PATH']
-    k6_args = [
-        'run', args.scenario,
-        '--no-color',
-    ]
-    k6_args.extend(['--duration', args.duration])
-    k6_args.extend(['--iterations', args.n_prompts])
-    k6_args.extend(['--vus', args.parallel])
-    k6_args.extend(['--summary-export', 'k6-results.json'])
-    args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
-    args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
-    print(f"bench: starting k6 with: {args}")
-    k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
-    if k6_completed.returncode != 0:
-        raise Exception("bench: unable to run k6")
-
-
-def start_server(args):
-    server_process = start_server_background(args)
-
-    attempts = 0
-    max_attempts = 20
-    if 'GITHUB_ACTIONS' in os.environ:
-        max_attempts *= 2
-
-    while not is_server_listening(args.host, args.port):
-        attempts += 1
-        if attempts > max_attempts:
-            assert False, "server not started"
-        print(f"bench:     waiting for server to start ...")
-        time.sleep(0.5)
-
-    print("bench: server started.")
-    return server_process
-
-
-def start_server_background(args):
-    # Start the server
-    server_path = '../../../build/bin/server'
-    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
-        server_path = os.environ['LLAMA_SERVER_BIN_PATH']
-    server_args = [
-        '--host', args.host,
-        '--port', args.port,
-    ]
-    model_file = args.model_path_prefix + os.path.sep + args.hf_file
-    model_dir  = os.path.dirname(model_file)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    server_args.extend(['--model', model_file])
-    server_args.extend(['--hf-repo', args.hf_repo])
-    server_args.extend(['--hf-file', args.hf_file])
-    server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
-    server_args.extend(['--ctx-size', args.ctx_size])
-    server_args.extend(['--parallel', args.parallel])
-    server_args.extend(['--batch-size', args.batch_size])
-    server_args.extend(['--ubatch-size', args.ubatch_size])
-    server_args.extend(['--n-predict', args.max_tokens * 2])
-    server_args.extend(['--defrag-thold', "0.1"])
-    server_args.append('--cont-batching')
-    server_args.append('--metrics')
-    server_args.extend(['--log-format', "text"])
-    args = [str(arg) for arg in [server_path, *server_args]]
-    print(f"bench: starting server with: {' '.join(args)}")
-    pkwargs = {
-        'stdout': subprocess.PIPE,
-        'stderr': subprocess.PIPE
-    }
-    server_process = subprocess.Popen(
-        args,
-        **pkwargs)
-
-    def server_log(in_stream, out_stream):
-        for line in iter(in_stream.readline, b''):
-            print(line.decode('utf-8'), end='', file=out_stream)
-
-    thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
-    thread_stdout.start()
-    thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
-    thread_stderr.start()
-
-    return server_process
-
-
-def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        _is_server_listening = result == 0
-        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...")
-        return _is_server_listening
-
-
-def escape_metric_name(metric_name):
-    return re.sub('[^A-Z0-9]', '_', metric_name.upper())
-
-
-if __name__ == '__main__':
-    main()
--- a/examples/server/bench/prometheus.yml
+++ b/examples/server/bench/prometheus.yml
@@ -1,9 +0,0 @@
-global:
-  scrape_interval:     10s
-  external_labels:
-    llamacpp: 'server'
-
-scrape_configs:
-  - job_name: 'llama.cpp server'
-    static_configs:
-      - targets: ['localhost:8080']
--- a/examples/server/bench/requirements.txt
+++ b/examples/server/bench/requirements.txt
@@ -1,2 +0,0 @@
-matplotlib
-requests
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,4 +1,4 @@
-import sse from 'k6/x/sse'
+import http from 'k6/http'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,9 +53,7 @@ const data = new SharedArray('conversations', function () {

 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
-
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
-const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')

 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -88,62 +86,35 @@ export default function () {
            }
        ],
        "model": model,
-        "stream": true,
-        "seed": 42,
+        "stream": false,
        "max_tokens": max_tokens
    }

-    const params = {method: 'POST', body: JSON.stringify(payload)};
+    const body = JSON.stringify(payload)

-    const startTime = new Date()
-    let promptEvalEndTime = null
-    let prompt_tokens = 0
-    let completions_tokens = 0
-    let finish_reason = null
-    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
-        client.on('event', function (event) {
-            if (promptEvalEndTime == null) {
-                promptEvalEndTime = new Date()
-            }
-
-            let chunk = JSON.parse(event.data)
-            let choice = chunk.choices[0]
-            if (choice.finish_reason) {
-                finish_reason = choice.finish_reason
-            }
-
-            if (chunk.usage) {
-                prompt_tokens = chunk.usage.prompt_tokens
-                llamacpp_prompt_tokens.add(prompt_tokens)
-                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
-
-                completions_tokens = chunk.usage.completion_tokens
-                llamacpp_completion_tokens.add(completions_tokens)
-                llamacpp_completion_tokens_total_counter.add(completions_tokens)
-            }
-        })
-
-        client.on('error', function (e) {
-            console.log('An unexpected error occurred: ', e.error());
-            throw e;
-        })
+    let res = http.post(`${server_url}/chat/completions`, body, {
+        headers: {'Content-Type': 'application/json'},
+        timeout: '300s'
    })

    check(res, {'success completion': (r) => r.status === 200})

-    const endTime = new Date()
+    if (res.status === 200) {
+        const completions = res.json()

-    const promptEvalTime = promptEvalEndTime - startTime
-    if (promptEvalTime > 0) {
-        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
-    }
+        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

-    const completion_time = endTime - promptEvalEndTime
-    if (completions_tokens > 0 && completion_time > 0) {
-        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
+        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+
+        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
+        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+
+        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
+    } else {
+        console.error(`response: ${res.body} request=${payload}`)
    }
-    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
-    llamacpp_completions_stop_rate.add(finish_reason === 'stop')

    sleep(0.3)
 }
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -43,454 +43,444 @@ unsigned char completion_js[] = {
  0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20,
  0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d,
-  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f,
-  0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
-  0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75,
-  0x6c, 0x74, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
-  0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60, 0x24,
-  0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x60, 0x2c, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a,
-  0x20, 0x27, 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x62, 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e,
-  0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x69, 0x66,
+  0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
+  0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65,
+  0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
+  0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f,
  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61,
-  0x64, 0x65, 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x27, 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c,
-  0x69, 0x76, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x27, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70,
-  0x65, 0x27, 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70,
-  0x74, 0x27, 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76,
-  0x65, 0x6e, 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65,
-  0x79, 0x20, 0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72,
-  0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42,
-  0x65, 0x61, 0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60,
-  0x7d, 0x20, 0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61,
-  0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20,
-  0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65,
-  0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e,
-  0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63,
-  0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54,
-  0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
-  0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
-  0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42,
-  0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61,
-  0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72,
-  0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20,
-  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66,
-  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74,
-  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
-  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64,
-  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c,
-  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65,
-  0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72,
-  0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e,
+  0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61,
+  0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c,
+  0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20,
+  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f,
+  0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
+  0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70,
+  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27,
+  0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62,
+  0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
+  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70,
+  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73,
+  0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65,
+  0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27,
+  0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76,
+  0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43,
+  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70, 0x65, 0x27,
+  0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69,
+  0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x27,
+  0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e,
+  0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70, 0x61, 0x72,
+  0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x20,
+  0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72, 0x69, 0x7a,
+  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42, 0x65, 0x61,
+  0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
+  0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60, 0x7d, 0x20,
+  0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x3a,
+  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e,
+  0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20, 0x7d, 0x29,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
+  0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x70,
+  0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e, 0x67, 0x65,
+  0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64,
+  0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54, 0x65, 0x78,
+  0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a,
+  0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
+  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x6c,
+  0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20,
+  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42, 0x75, 0x66,
+  0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, 0x72, 0x74,
+  0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63,
+  0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
+  0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65,
+  0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
+  0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x41,
+  0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74, 0x6f, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74,
+  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x65, 0x66,
+  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65, 0x63, 0x6f,
+  0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63,
+  0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e,
+  0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x73,
+  0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72, 0x65, 0x61,
+  0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65, 0x6e, 0x64,
+  0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53,
+  0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78,
+  0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
+  0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x49,
+  0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x64,
+  0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77,
+  0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62,
+  0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65,
+  0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
+  0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20,
+  0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x20,
+  0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20,
+  0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x65, 0x6e,
  0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72,
-  0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65,
-  0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27,
-  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78,
-  0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27,
-  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74,
-  0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64,
-  0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65,
-  0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69,
-  0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69,
-  0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74,
-  0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74,
-  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
-  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
-  0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
-  0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f,
-  0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65,
-  0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20,
-  0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20,
-  0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74,
-  0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e,
-  0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29,
-  0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63,
-  0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61,
-  0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61,
-  0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63,
-  0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a,
-  0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64,
-  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a,
-  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20,
-  0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66,
-  0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20,
-  0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61,
-  0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20,
+  0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65,
+  0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d,
+  0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65, 0x73, 0x65,
+  0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69,
+  0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x61, 0x20,
+  0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x61,
+  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c,
+  0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73,
+  0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74, 0x68, 0x65,
+  0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c,
+  0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f,
+  0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
+  0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e,
+  0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x3d, 0x20,
+  0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63, 0x28, 0x6c,
+  0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63,
+  0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63,
+  0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
+  0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
+  0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
+  0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
+  0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
+  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
+  0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
+  0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f,
+  0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79, 0x69,
+  0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65,
+  0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, 0x66,
+  0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73, 0x74,
+  0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72, 0x6f,
+  0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77, 0x65,
+  0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20,
+  0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f,
+  0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73,
-  0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20,
-  0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65,
-  0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a,
-  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
-  0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
-  0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77,
-  0x20, 0x61, 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f,
-  0x20, 0x62, 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62,
-  0x79, 0x20, 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63,
-  0x61, 0x6c, 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72,
-  0x6f, 0x72, 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61,
-  0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24,
-  0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e,
-  0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28,
-  0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e,
-  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62,
-  0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22,
-  0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c,
-  0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72,
-  0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61,
-  0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61,
-  0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75,
-  0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65,
-  0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61,
-  0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65,
-  0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-  0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
-  0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74,
-  0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e,
-  0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79,
-  0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
-  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f,
-  0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c,
-  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64,
-  0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
-  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
-  0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
-  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
-  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
-  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
+  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20,
+  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
+  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
+  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x66, 0x61,
+  0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72,
+  0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f,
+  0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
+  0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
+  0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73, 0x28, 0x27,
+  0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c,
+  0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61,
+  0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62,
+  0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20,
+  0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c,
+  0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72,
+  0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72,
+  0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
+  0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72,
+  0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
+  0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24, 0x7b, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e,
+  0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b, 0x72, 0x65,
+  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x74,
+  0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65,
+  0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20,
+  0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72,
+  0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20,
+  0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61,
+  0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72,
+  0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20,
+  0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20,
+  0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20,
+  0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73,
+  0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a,
+  0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70,
+  0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d,
+  0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70,
+  0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61,
+  0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
+  0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f,
+  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64,
+  0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65,
+  0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c,
+  0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
+  0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65,
+  0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69,
+  0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
+  0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65,
+  0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
+  0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d,
+  0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
+  0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20,
+  0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
+  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77,
+  0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
+  0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66,
+  0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
+  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63,
+  0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
+  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+  0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
+  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65,
  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73,
  0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e,
  0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22,
+  0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22,
  0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
-  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
-  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
-  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
-  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43,
-  0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73,
-  0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f,
-  0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f,
-  0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
-  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69,
-  0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74,
-  0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e,
-  0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66,
-  0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e,
-  0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61,
-  0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
-  0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
-  0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
-  0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65,
-  0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64,
-  0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20,
-  0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e,
-  0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63,
-  0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d,
+  0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68,
+  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e,
+  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
+  0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68,
+  0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75,
+  0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67,
+  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
+  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64,
+  0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
+  0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
+  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
+  0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
+  0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
+  0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20,
+  0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
+  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d,
+  0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68,
+  0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75,
+  0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64,
+  0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61,
+  0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+  0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
+  0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
+  0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
+  0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c,
+  0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74,
+  0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73,
+  0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c,
+  0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63,
+  0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78,
+  0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73,
+  0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74,
+  0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65,
+  0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65,
+  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65,
+  0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20,
+  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
+  0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
+  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61,
+  0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72,
+  0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72,
+  0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63,
+  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72,
+  0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f,
+  0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20,
+  0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
+  0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77,
+  0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79,
+  0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c,
+  0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69,
  0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75,
  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28,
-  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61,
-  0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f,
-  0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64,
-  0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e,
-  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65,
-  0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61,
-  0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78,
-  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66,
-  0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x61, 0x70, 0x69,
-  0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20, 0x3d, 0x20, 0x61,
-  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60,
-  0x24, 0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x60, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
-  0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28,
-  0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e,
-  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
-  0x73, 0x3b, 0x0a, 0x7d, 0x0a
+  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61,
+  0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
+  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68,
+  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f,
+  0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63,
+  0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74,
+  0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a,
+  0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70,
+  0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f,
+  0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c,
+  0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
+  0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c,
+  0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
+  0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61,
+  0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
+  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63,
+  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29,
+  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c,
+  0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47,
+  0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c,
+  0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54,
+  0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75,
+  0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e,
+  0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78,
+  0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64,
+  0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f,
+  0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61,
+  0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20,
+  0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d,
+  0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67,
+  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
+  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70,
+  0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65,
+  0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22,
+  0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20,
+  0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75,
+  0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
+  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 };
-unsigned int completion_js_len = 5909;
+size_t completion_js_len = 5796;
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
@@ -1928,4 +1928,4 @@ unsigned char index_js[] = {
  0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x50, 0x74, 0x20, 0x61, 0x73,
  0x20, 0x75, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x7d, 0x3b, 0x0a
 };
-unsigned int index_js_len = 23136;
+size_t index_js_len = 23136;
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -21,7 +21,6 @@ let generation_settings = null;
 //
 export async function* llama(prompt, params = {}, config = {}) {
  let controller = config.controller;
-  const api_url = config.api_url || "";

  if (!controller) {
    controller = new AbortController();
@@ -29,7 +28,7 @@ export async function* llama(prompt, params = {}, config = {}) {

  const completionParams = { ...paramDefaults, ...params, prompt };

-  const response = await fetch(`${api_url}/completion`, {
+  const response = await fetch("/completion", {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
@@ -194,10 +193,9 @@ export const llamaComplete = async (params, controller, callback) => {
 }

 // Get the model info from the server. This is useful for getting the context window and so on.
-export const llamaModelInfo = async (config = {}) => {
+export const llamaModelInfo = async () => {
  if (!generation_settings) {
-    const api_url = config.api_url || "";
-    const props = await fetch(`${api_url}/props`).then(r => r.json());
+    const props = await fetch("/props").then(r => r.json());
    generation_settings = props.default_generation_settings;
  }
  return generation_settings;
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -199,10 +199,10 @@
  <script type="module">
    import {
      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
-    } from './index.js';
+    } from '/index.js';

-    import { llama } from './completion.js';
-    import { SchemaConverter } from './json-schema-to-grammar.mjs';
+    import { llama } from '/completion.js';
+    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
    let selected_image = false;
    var slot_id = -1;

@@ -222,7 +222,6 @@
      temperature: 0.7,
      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.18, // 1.0 = disabled
-      penalize_nl: false,
      top_k: 40, // <= 0 to use vocab size
      top_p: 0.95, // 1.0 = disabled
      min_p: 0.05, // 0 = disabled
@@ -406,7 +405,7 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
        const data = chunk.data;

        if (data.stop) {
@@ -628,7 +627,6 @@
      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
-      const updateParamsBool = (el) => params.value = { ...params.value, [el.target.name]: el.target.checked }

      const grammarJsonSchemaPropOrder = signal('')
      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
@@ -672,15 +670,6 @@
        `
      };

-      const BoolField = ({ label, name, value }) => {
-        return html`
-          <div>
-            <label for="${name}">${label}</label>
-            <input type="checkbox" id="${name}" name="${name}" checked="${value}" onclick=${updateParamsBool} />
-          </div>
-        `
-      };
-
      const userTemplateReset = (e) => {
        e.preventDefault();
        userTemplateResetToDefaultAndApply()
@@ -780,7 +769,6 @@
            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
-            ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -48,7 +48,7 @@ export class SchemaConverter {
  }

  _formatLiteral(literal) {
-    const escaped = literal.replace(
+    const escaped = JSON.stringify(literal).replace(
      GRAMMAR_LITERAL_ESCAPE_RE,
      m => GRAMMAR_LITERAL_ESCAPES[m]
    );
@@ -327,7 +327,10 @@ export class SchemaConverter {
  }

  _generateConstantRule(value) {
-    return this._formatLiteral(JSON.stringify(value));
+    if (typeof value !== 'string') {
+      throw new Error('Only string constants are supported, got ' + JSON.stringify(value));
+    }
+    return this._formatLiteral(value);
  }

  visit(schema, name) {
@@ -343,6 +346,9 @@ export class SchemaConverter {
    } else if (Array.isArray(schemaType)) {
      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
    } else if ('const' in schema) {
+      if (typeof schema.const !== 'string') {
+        throw new Error('Only string constants are supported, got ' + JSON.stringify(schema.const));
+      }
      return this._addRule(ruleName, this._generateConstantRule(schema.const));
    } else if ('enum' in schema) {
      const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
@@ -451,7 +457,7 @@ export class SchemaConverter {
      const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
      propKvRuleNames[propName] = this._addRule(
        `${name ?? ''}${name ? '-' : ''}${propName}-kv`,
-        `${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
+        `${this._formatLiteral(propName)} space ":" space ${propRuleName}`
      );
    }
    const requiredProps = sortedProps.filter(k => required.has(k));
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -30,7 +30,7 @@
 #include <signal.h>
 #include <memory>

-using json = nlohmann::ordered_json;
+using json = nlohmann::json;

 bool server_verbose = false;
 bool server_log_json = true;
@@ -99,7 +99,6 @@ struct slot_params {

    uint32_t seed      = -1; // RNG seed
    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
-    int32_t  n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t  n_predict = -1; // new tokens to predict

    std::vector<std::string> antiprompt;
@@ -747,8 +746,7 @@ struct server_context {
        {
            const int32_t n_batch = llama_n_batch(ctx);

-            // only a single seq_id per token is needed
-            batch = llama_batch_init(n_batch, 0, 1);
+            batch = llama_batch_init(n_batch, 0, params.n_parallel);
        }

        metrics.init();
@@ -848,18 +846,10 @@ struct server_context {
        slot.sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot.params.n_keep             = json_value(data, "n_keep",            slot.params.n_keep);
-        slot.params.n_discard          = json_value(data, "n_discard",         default_params.n_discard);
        slot.params.seed               = json_value(data, "seed",              default_params.seed);
-        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
-        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
-
-        // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && data.contains("grammar")) {
-            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
-            return false;
-        } else if (data.contains("json_schema") && !data.contains("grammar")) {
+        if (data.contains("json_schema") && !data.contains("grammar")) {
            try {
-                auto schema                = json_value(data, "json_schema", json::object());
+                auto schema                = json_value(data, "json_schema",       json::object());
                slot.sparams.grammar       = json_schema_to_grammar(schema);
            } catch (const std::exception & e) {
                send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -868,6 +858,8 @@ struct server_context {
        } else {
            slot.sparams.grammar       = json_value(data, "grammar",           default_sparams.grammar);
        }
+        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
+        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

        if (slot.params.cache_prompt && slot.ga_n != 1) {
            LOG_WARNING("cache_prompt is not supported with group-attention", {});
@@ -1256,7 +1248,6 @@ struct server_context {
            {"stop",                      slot.params.antiprompt},
            {"n_predict",                 slot.params.n_predict}, // TODO: fix duplicate key n_predict
            {"n_keep",                    slot.params.n_keep},
-            {"n_discard",                 slot.params.n_discard},
            {"ignore_eos",                ignore_eos},
            {"stream",                    slot.params.stream},
            {"logit_bias",                slot.sparams.logit_bias},
@@ -1700,7 +1691,7 @@ struct server_context {
                    // Shift context
                    const int n_keep    = slot.params.n_keep + add_bos_token;
                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
-                    const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
+                    const int n_discard = n_left / 2;

                    LOG_INFO("slot context shift", {
                        {"id_slot",         slot.id},
@@ -2189,6 +2180,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("                            KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
    printf("  -b N, --batch-size N      logical maximum batch size (default: %d)\n", params.n_batch);
    printf("  -ub N, --ubatch-size N    physical maximum batch size (default: %d)\n", params.n_ubatch);
+    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_supports_mlock()) {
        printf("  --mlock                   force system to keep model in RAM rather than swapping or compressing\n");
    }
@@ -2211,17 +2204,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
        printf("                            fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
        printf("  -mg i, --main-gpu i       the GPU to use for the model (with split-mode = none),\n");
        printf("                            or for intermediate results and KV (with split-mode = row)\n");
-        printf("  -nkvo, --no-kv-offload\n");
-        printf("                            disable KV offload\n");
    }
    printf("  -m FNAME, --model FNAME\n");
    printf("                            model path (default: %s)\n", params.model.c_str());
    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
-    printf("                            model download url (default: unused)\n");
-    printf("  -hfr REPO, --hf-repo REPO\n");
-    printf("                            Hugging Face model repository (default: unused)\n");
-    printf("  -hff FILE, --hf-file FILE\n");
-    printf("                            Hugging Face model file (default: unused)\n");
+    printf("                            model download url (default: %s)\n", params.model_url.c_str());
    printf("  -a ALIAS, --alias ALIAS\n");
    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
@@ -2350,18 +2337,6 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                break;
            }
            params.model_url = argv[i];
-        } else if (arg == "-hfr" || arg == "--hf-repo") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.hf_repo = argv[i];
-        } else if (arg == "-hff" || arg == "--hf-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.hf_file = argv[i];
        } else if (arg == "-a" || arg == "--alias") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2498,8 +2473,6 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                    "See main README.md for information on enabling GPU BLAS support",
                    {{"n_gpu_layers", params.n_gpu_layers}});
            }
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            params.no_kv_offload = true;
        } else if (arg == "--split-mode" || arg == "-sm") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2516,15 +2489,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-#ifndef GGML_USE_CUDA
-            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA
+#ifndef GGML_USE_CUBLAS
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUBLAS
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
            std::string arg_next = argv[i];

            // split string by , and /
@@ -2541,17 +2514,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                }
            }
 #else
-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUDA
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUBLAS
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
            params.main_gpu = std::stoi(argv[i]);
 #else
-            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
 #endif
        } else if (arg == "--lora") {
            if (++i >= argc) {
@@ -3568,7 +3541,6 @@ int main(int argc, char ** argv) {
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
-    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -4,8 +4,7 @@ Feature: Parallel

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
-    And   a model file test-model-00001-of-00003.gguf
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And   42 as server seed
    And   128 as batch size
    And   256 KV cache size
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -4,8 +4,8 @@ Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model file test-model.gguf
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
+    And   a model file stories260K.gguf
    And   a model alias tinyllama-2
    And   42 as server seed
      # KV Cache corresponds to the total amount of tokens
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -16,6 +16,7 @@ import numpy as np
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
+from huggingface_hub import hf_hub_download
 from prometheus_client import parser


@@ -38,8 +39,6 @@ def step_server_config(context, server_fqdn, server_port):

    context.model_alias = None
    context.model_file = None
-    context.model_hf_repo = None
-    context.model_hf_file = None
    context.model_url = None
    context.n_batch = None
    context.n_ubatch = None
@@ -69,9 +68,9 @@ def step_server_config(context, server_fqdn, server_port):

@step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file, hf_repo):
-    context.model_hf_repo = hf_repo
-    context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)
+    context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
+    if context.debug:
+        print(f"model file: {context.model_file}")


@step('a model file {model_file}')
@@ -1080,10 +1079,6 @@ def start_server_background(context):
        server_args.extend(['--model', context.model_file])
    if context.model_url:
        server_args.extend(['--model-url', context.model_url])
-    if context.model_hf_repo:
-        server_args.extend(['--hf-repo', context.model_hf_repo])
-    if context.model_hf_file:
-        server_args.extend(['--hf-file', context.model_hf_file])
    if context.n_batch:
        server_args.extend(['--batch-size', context.n_batch])
    if context.n_ubatch:
@@ -1114,10 +1109,7 @@ def start_server_background(context):
        server_args.append('--verbose')
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
-
-    args = [str(arg) for arg in [context.server_path, *server_args]]
-    print(f"bench: starting server with: {' '.join(args)}")
-
+    print(f"starting server with: {context.server_path} {server_args}")
    flags = 0
    if 'nt' == os.name:
        flags |= subprocess.DETACHED_PROCESS
@@ -1133,14 +1125,16 @@ def start_server_background(context):
        [str(arg) for arg in [context.server_path, *server_args]],
        **pkwargs)

-    def server_log(in_stream, out_stream):
-        for line in iter(in_stream.readline, b''):
-            print(line.decode('utf-8'), end='', file=out_stream)
-
-    thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
    thread_stdout.start()

-    thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
    thread_stderr.start()

    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -12,7 +12,7 @@

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

-using json = nlohmann::ordered_json;
+using json = nlohmann::json;

 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
@@ -49,23 +49,12 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

-static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
-
 template <typename T>
 static T json_value(const json &body, const std::string &key, const T &default_value) {
    // Fallback null to default value
-    if (body.contains(key) && !body.at(key).is_null()){
-        try {
-            return body.value(key, default_value);
-        }
-        catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
-            std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
-            server_log("WARN", __func__, __LINE__, message.c_str(), body);
-            return default_value;
-        }
-    } else {
-        return default_value;
-    }
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
 }

 static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
@@ -106,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line,

        const std::string str = ss.str();
        printf("%.*s\n", (int)str.size(), str.data());
+        fflush(stdout);
    }
-    fflush(stdout);
 }

 //
@@ -363,71 +352,51 @@ static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body,   "model",             std::string("unknown"));
-    llama_params["frequency_penalty"] = json_value(body,   "frequency_penalty", 0.0);
-    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
+    llama_params["prompt"]            = format_chat(model, chat_template,       body["messages"]);
+    llama_params["cache_prompt"]      = json_value(body,   "cache_prompt",      false);
+    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
+    llama_params["top_k"]             = json_value(body,   "top_k",             default_sparams.top_k);
+    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);
    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
+    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
+    llama_params["frequency_penalty"] = json_value(body,   "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body,   "presence_penalty",  0.0);
    llama_params["seed"]              = json_value(body,   "seed",              LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body,   "stream",            false);
-    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
-    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);
+    llama_params["mirostat"]          = json_value(body,   "mirostat",          default_sparams.mirostat);
+    llama_params["mirostat_tau"]      = json_value(body,   "mirostat_tau",      default_sparams.mirostat_tau);
+    llama_params["mirostat_eta"]      = json_value(body,   "mirostat_eta",      default_sparams.mirostat_eta);
+    llama_params["penalize_nl"]       = json_value(body,   "penalize_nl",       default_sparams.penalize_nl);
+    llama_params["typical_p"]         = json_value(body,   "typical_p",         default_sparams.typical_p);
+    llama_params["repeat_last_n"]     = json_value(body,   "repeat_last_n",     default_sparams.penalty_last_n);
+    llama_params["ignore_eos"]        = json_value(body,   "ignore_eos",        false);
+    llama_params["tfs_z"]             = json_value(body,   "tfs_z",             default_sparams.tfs_z);
+    llama_params["n_keep"]            = json_value(body,   "n_keep",            0);

-    // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
+    if (body.contains("grammar")) {
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
+    }

-    // Handle "stop" field
+    if (body.contains("response_format")) {
+        auto response_format = json_value(body, "response_format", json::object());
+        if (response_format.contains("type")) {
+            if (response_format["type"] == "json_object") {
+                llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+            } else {
+                throw std::runtime_error("response_format type not supported: " + response_format["type"].dump());
+            }
+        }
+    }
+
+    // Handle 'stop' field
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }
-    // Some chat templates don't use EOS token to stop generation
-    // We must add their end sequences to list of stop words
-    llama_params["stop"].push_back("<|im_end|>"); // chatml
-    llama_params["stop"].push_back("<end_of_turn>"); // gemma

-    // Handle "response_format" field
-    if (body.contains("response_format")) {
-        json response_format      = json_value(body, "response_format", json::object());
-        std::string response_type = json_value(response_format, "type", std::string());
-        if (response_type == "json_object") {
-            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
-        } else if (!response_type.empty() && response_type != "text") {
-            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
-        }
-    }
-
-    // Handle "n" field
-    int n_choices = json_value(body, "n", 1);
-    if (n_choices != 1) {
-        throw std::runtime_error("Only one completion choice is allowed");
-    }
-
-    // Handle "logprobs" field
-    // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
-    if (body.contains("logprobs")) {
-        llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
-    } else if (body.contains("top_logprobs")) {
-        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
-    }
-
-    // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-    for (auto & param : unsupported_params) {
-        if (body.contains(param)) {
-            throw std::runtime_error("Unsupported param: " + param);
-        }
-    }
-
-    // Copy remaining properties to llama_params
-    // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
-    // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
-    for (const auto & item : body.items()) {
-        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
-        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
-            llama_params[item.key()] = item.value();
-        }
-    }
+    // Ensure there is ChatML-specific end sequence among stop words
+    llama_params["stop"].push_back("<|im_end|>");

    return llama_params;
 }
@@ -567,15 +536,6 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
        {"model",   modelname},
        {"object",  "chat.completion.chunk"}
    };
-    if (!finish_reason.empty()) {
-        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
-        int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
-        ret.push_back({"usage", json {
-            {"completion_tokens", num_tokens_predicted},
-            {"prompt_tokens",     num_prompt_tokens},
-            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
-        }});
-    }

    return std::vector<json>({ret});
 }
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -65,6 +65,7 @@ int main(int argc, char ** argv) {
    llama_context * ctx_dft = NULL;

    // load the target model
+    params.logits_all = true;
    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);

    // load the draft model
@@ -218,8 +219,7 @@ int main(int argc, char ** argv) {
                if (params.sparams.temp > 0) {
                    // stochastic verification

-                    llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
-                    llama_sample_softmax(ctx_tgt, &dist_tgt);
+                    llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
                    float p_tgt = 0, p_dft = 0;

                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -3,13 +3,9 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT

-
-IF not exist build (mkdir build)
+mkdir -p build
 cd build
-if %errorlevel% neq 0 goto ERROR
-
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
@@ -17,18 +13,11 @@ if %errorlevel% neq 0 goto ERROR

 ::  for FP32
 cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-if %errorlevel% neq 0 goto ERROR
+
+
 ::  build example/main only
 ::  make main

 ::  build all binary
 make -j
-if %errorlevel% neq 0 goto ERROR
-
 cd ..
-exit /B 0
-
-:ERROR
-echo comomand error: %errorlevel%
-exit /B %errorlevel%
-
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1711703276,
-        "narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -145,7 +145,6 @@
            # the same path you would with an overlay.
            legacyPackages = {
              llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
-              llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
              llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            };
@@ -156,7 +155,6 @@
              {
                default = config.legacyPackages.llamaPackages.llama-cpp;
                vulkan = config.packages.default.override { useVulkan = true; };
-                windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
              }
              // lib.optionalAttrs pkgs.stdenv.isLinux {
                opencl = config.packages.default.override { useOpenCL = true; };
@@ -170,14 +168,9 @@
              };

            # Packages exposed in `.#checks` will be built by the CI and by
-            # `nix flake check`.
-            #
-            # We could test all outputs e.g. as `checks = confg.packages`.
-            #
-            # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
-            checks = {
-              inherit (config.packages) default vulkan;
-            };
+            # `nix flake check`. Currently we expose all packages, but we could
+            # make more granular choices
+            checks = config.packages;
          };
      };
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	8c3d5b5a79	common : remove defaults	2024-03-22 15:33:24 +02:00
Georgi Gerganov	1b2f0a9ee8	common : add HF arg helpers	2024-03-22 14:32:36 +02:00