llama : add llama_sampling API + move grammar in libllama

ggml-ci
2026-04-16 16:27:32 +03:00 · 2024-09-03 10:31:54 +03:00
200 changed files with 14028 additions and 22612 deletions
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH="\
+ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH="\
    gfx1030 \
    gfx1100 \
    gfx1101 \
-    gfx1102"
+    gfx1102

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH="\
+ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH="\
    gfx1030 \
    gfx1100 \
    gfx1101 \
-    gfx1102"
+    gfx1102

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH="\
+ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
@@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH="\
    gfx1030 \
    gfx1100 \
    gfx1101 \
-    gfx1102"
+    gfx1102

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -34,7 +34,7 @@ WORKDIR /app
 COPY . .

 # Set nvcc architecture
-ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-# Do not ignore .git directory, otherwise the reported build number will always be 0
+.git/
 .github/
 .gitignore
 .vs/
--- a/.github/workflows/bench.yml.disabled
+++ b/.github/workflows/bench.yml.disabled
@@ -27,10 +27,10 @@ on:
  push:
    branches:
      - master
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
-    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    -  cron: '04 2 * * *'

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -23,9 +23,6 @@ env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
  macOS-latest-cmake-arm64:
@@ -378,7 +375,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Dependencies
        id: depends
@@ -404,7 +401,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -445,7 +442,7 @@ jobs:
    continue-on-error: true

    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v2

      - name: add oneAPI to apt
        shell: bash
@@ -549,7 +546,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -579,7 +576,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -613,7 +610,7 @@ jobs:
    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -860,7 +857,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+          cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

@@ -956,7 +953,6 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

@@ -968,20 +964,19 @@ jobs:
          name: llama-bin-win-sycl-x64.zip

  windows-latest-cmake-hip:
-    if: ${{ github.event.inputs.create_release != 'true' }}
    runs-on: windows-latest

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
          write-host "Completed AMD HIP SDK installation"
@@ -996,72 +991,8 @@ jobs:
        run: |
          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-  windows-latest-cmake-hip-release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-    runs-on: windows-latest
-
-    strategy:
-      matrix:
-        gpu_target: [gfx1100, gfx1101, gfx1030]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Install
-        id: depends
-        run: |
-          $ErrorActionPreference = "Stop"
-          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
-          write-host "Installing AMD HIP SDK"
-          Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
-          write-host "Completed AMD HIP SDK installation"
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-          md "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
-          name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
+          cmake --build build --config Release

  ios-xcode-build:
    runs-on: macos-latest
@@ -1126,7 +1057,6 @@ jobs:
      - macOS-latest-cmake
      - windows-latest-cmake
      - windows-latest-cmake-cuda
-      - windows-latest-cmake-hip-release
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64

--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,17 +15,11 @@ on:
    branches:
      - master
    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  packages: write
-
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
@@ -43,17 +37,15 @@ jobs:
          - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
@@ -68,34 +60,6 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-
-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}"
-          fi
-
-          # list all tags possible
-          TAGS=""
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }},"
-          TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}"
-
-          echo "output_tags=$TAGS" >> $GITHUB_OUTPUT
-          echo "output_tags=$TAGS"  # print out for debugging
-        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
@@ -113,6 +77,25 @@ jobs:
          docker-images: true
          swap-storage: true

+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Downcase github.repository_owner
+        run: |
+          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
      - name: Build and push Docker image (tagged + versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v6
@@ -120,6 +103,5 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.output_tags }}
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,13 +4,11 @@ on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'

@@ -35,6 +33,6 @@ jobs:
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
-          version: 1.1.382
+          version: 1.1.370
          level: warning
          warnings: true
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -20,12 +20,6 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
@@ -179,7 +173,6 @@ jobs:
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
-          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

      - name: Slow tests
--- a/.gitignore
+++ b/.gitignore
@@ -61,7 +61,6 @@ llama-batched-swift
 /rpc-server
 out/
 tmp/
-autogen-*.md

 # Deprecated

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,9 +62,6 @@ option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
 option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

-# utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
-
 # extra artifacts
 option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
@@ -85,11 +82,11 @@ set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+    set(GGML_LLAMAFILE ON)
 endif()

-if (NOT DEFINED GGML_CUDA_GRAPHS)
-    set(GGML_CUDA_GRAPHS_DEFAULT ON)
+if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
+    set(GGML_CUDA_USE_GRAPHS ON)
 endif()

 # transition helpers
@@ -142,16 +139,10 @@ set(LLAMA_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
-if (GGML_DIR_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
-endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-if (GGML_TARGET_DEFINES)
-    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
-endif()
+set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
@@ -194,17 +185,15 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        DESTINATION lib/pkgconfig)

 #
-# utils, programs, examples and tests
+# programs, examples and tests
 #

-if (LLAMA_BUILD_COMMON)
-    add_subdirectory(common)
-endif()
+add_subdirectory(common)

 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
-endif()
+endif ()

 if (LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -32,8 +32,8 @@

    {
        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
        }
@@ -41,8 +41,8 @@

    {
        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",    "strategy": "external" },
-        "toolset":      { "value": "host=x64", "strategy": "external" },
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
        "cacheVariables": {
            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
        }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,8 +27,3 @@

 ![matmul](media/matmul.png)

-# Resources
-
-The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
-
-https://github.com/ggerganov/llama.cpp/projects
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@ BUILD_TARGETS = \
 	llama-batched \
 	llama-batched-bench \
 	llama-bench \
+	llama-benchmark-matmult \
 	llama-cli \
 	llama-convert-llama2c-to-ggml \
 	llama-embedding \
@@ -38,12 +39,10 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-vdot \
 	llama-cvector-generator \
-	llama-gen-docs \
 	tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
 	tests/test-chat-template \
@@ -53,7 +52,6 @@ TEST_TARGETS = \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
 	tests/test-llama-grammar \
-	tests/test-log \
 	tests/test-model-load-cancel \
 	tests/test-opt \
 	tests/test-quantize-fns \
@@ -67,7 +65,7 @@ TEST_TARGETS = \
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-	retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
+	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm

 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 #  We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
@@ -148,14 +146,6 @@ GGML_NO_METAL := 1
 DEPRECATE_WARNING := 1
 endif

-ifdef LLAMA_DISABLE_LOGS
-REMOVE_WARNING := 1
-endif
-
-ifdef LLAMA_SERVER_VERBOSE
-REMOVE_WARNING := 1
-endif
-
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -359,11 +349,19 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 	MK_LDFLAGS  += -fsanitize=undefined -g
 endif

+ifdef LLAMA_SERVER_VERBOSE
+	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+endif
+
 ifdef LLAMA_SERVER_SSL
 	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
 	MK_LDFLAGS += -lssl -lcrypto
 endif

+ifdef LLAMA_DISABLE_LOGS
+	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
+endif # LLAMA_DISABLE_LOGS
+
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -434,7 +432,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue

-ifndef RISCV_CROSS_COMPILE
+ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
@@ -514,12 +512,7 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CXXFLAGS += -mlasx
 endif

-ifneq ($(filter riscv64%,$(UNAME_M)),)
-	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
-	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif
-
-else # RISC-V CROSS COMPILATION
+else
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -610,7 +603,7 @@ ifdef GGML_CUDA

 		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
 		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
+		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
 	else
 		ifneq ('', '$(wildcard /opt/cuda)')
 			CUDA_PATH ?= /opt/cuda
@@ -618,7 +611,7 @@ ifdef GGML_CUDA
 			CUDA_PATH ?= /usr/local/cuda
 		endif

-		MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 		MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 		MK_NVCCFLAGS += -use_fast_math
 	endif # GGML_MUSA
@@ -930,8 +923,6 @@ OBJ_LLAMA = \

 OBJ_COMMON = \
 	common/common.o \
-	common/arg.o \
-	common/log.o \
 	common/console.o \
 	common/ngram-cache.o \
 	common/sampling.o \
@@ -1028,14 +1019,6 @@ $(info   - LLAMA_NO_CCACHE)
 $(info )
 endif

-ifdef REMOVE_WARNING
-$(info !!! REMOVAL WARNING !!!)
-$(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
-$(info )
-endif
-
 #
 # Build libraries
 #
@@ -1054,11 +1037,10 @@ ggml/src/ggml-alloc.o: \
 	$(CC)  $(CFLAGS)   -c $< -o $@

 ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.cpp \
-	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-backend.c \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CC)  $(CFLAGS)   -c $< -o $@

 ggml/src/ggml-quants.o: \
 	ggml/src/ggml-quants.c \
@@ -1173,16 +1155,6 @@ common/common.o: \
 	include/llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common/arg.o: \
-	common/arg.cpp \
-	common/arg.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/log.o: \
-	common/log.cpp \
-	common/log.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 common/sampling.o: \
 	common/sampling.cpp \
 	common/sampling.h \
@@ -1361,7 +1333,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_ALL)
+	$(OBJ_GGML) $(OBJ_LLAMA)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1455,7 +1427,6 @@ llama-server: \
 	examples/server/system-prompts.js.hpp \
 	examples/server/prompt-formats.js.hpp \
 	examples/server/json-schema-to-grammar.mjs.hpp \
-	examples/server/loading.html.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
@@ -1471,11 +1442,6 @@ examples/server/%.hpp: examples/server/public/% Makefile
 		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
 	) > $@

-llama-gen-docs: examples/gen-docs/gen-docs.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 libllava.a: examples/llava/llava.cpp \
 	examples/llava/llava.h \
 	examples/llava/clip.cpp \
@@ -1523,21 +1489,21 @@ common/build-info.o: common/build-info.cpp

 tests: $(TEST_TARGETS)

-tests/test-arg-parser: tests/test-arg-parser.cpp \
-	$(OBJ_ALL)
+llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \
+	$(OBJ_GGML) common/build-info.o
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+run-benchmark-matmult: llama-benchmark-matmult
+	./$<
+
+.PHONY: run-benchmark-matmult swift
+
 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-log: tests/test-log.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-grammar-parser: tests/test-grammar-parser.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
--- a/Package.swift
+++ b/Package.swift
@@ -11,7 +11,7 @@ var sources = [
    "src/unicode-data.cpp",
    "ggml/src/ggml.c",
    "ggml/src/ggml-alloc.c",
-    "ggml/src/ggml-backend.cpp",
+    "ggml/src/ggml-backend.c",
    "ggml/src/ggml-quants.c",
    "ggml/src/ggml-aarch64.c",
 ]
--- a/README.md
+++ b/README.md
@@ -17,8 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- *add hot topics here*

 ----

@@ -78,7 +77,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
 - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
@@ -91,8 +89,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -114,7 +110,6 @@ Typically finetunes of the base models below are supported as well.
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
@@ -168,17 +163,14 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 **Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)

 **Infrastructure:**

@@ -444,7 +436,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Contributors can open PRs
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated!
+- Any help with managing issues and PRs is very appreciated!
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -712,81 +712,6 @@ function gg_run_embd_bge_small {
    set +e
 }

-function gg_sum_embd_bge_small {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'BGE Small (BERT):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-}
-
-# rerank_tiny
-
-function gg_run_rerank_tiny {
-    cd ${SRC}
-
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-
-    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
-
-    path_models="../models-mnt/rerank-tiny"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-
-    (time ./bin/llama-embedding --model ${model_f16}  -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
-
-    # sample output
-    # rerank score 0:    0.029
-    # rerank score 1:    0.029
-    # rerank score 2:    0.135
-
-    # check that the score is in the range [$3, $4]
-    function check_score {
-        qnt="$1"
-        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$score"
-        return 0
-    }
-
-    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log
-
-    set +e
-}
-
-function gg_sum_rerank_tiny {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Rerank Tiny (Jina):\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
-}
-
 function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
@@ -801,10 +726,16 @@ function gg_check_build_requirements {
    fi
 }

-## main
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
+## main

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
@@ -828,7 +759,6 @@ test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
-    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts_debug
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -51,23 +51,19 @@ endif()
 set(TARGET common)

 add_library(${TARGET} STATIC
-    arg.cpp
-    arg.h
    base64.hpp
-    common.cpp
    common.h
-    console.cpp
-    console.h
-    json-schema-to-grammar.cpp
-    json.hpp
-    log.cpp
-    log.h
-    ngram-cache.cpp
-    ngram-cache.h
-    sampling.cpp
+    common.cpp
    sampling.h
-    train.cpp
+    sampling.cpp
+    console.h
+    console.cpp
+    json.hpp
+    json-schema-to-grammar.cpp
    train.h
+    train.cpp
+    ngram-cache.h
+    ngram-cache.cpp
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -1,77 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-#include <set>
-#include <string>
-#include <vector>
-
-//
-// CLI argument parsing
-//
-
-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint   = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env          = nullptr;
-    std::string help;
-    bool is_sparam = false; // is current arg a sampling param?
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
-    llama_arg & set_env(const char * env);
-    llama_arg & set_sparam();
-    bool in_example(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
-    std::string to_string();
-};
-
-struct gpt_params_context {
-    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-    gpt_params & params;
-    std::vector<llama_arg> options;
-    void(*print_usage)(int, char **) = nullptr;
-    gpt_params_context(gpt_params & params) : params(params) {}
-};
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-// function to be used by test-arg-parser
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -4,9 +4,18 @@

 #include "llama.h"

+#include "sampling.h"
+
+#define LOG_NO_FILE_LINE_FUNCTION
+#include "log.h"
+
+#include <cmath>
 #include <string>
 #include <vector>
-#include <sstream>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -45,100 +54,26 @@ struct llama_control_vector_load_info;
 // CPU utils
 //

-struct cpu_params {
-    int      n_threads                   = -1;
-    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-    bool     mask_valid                  = false;   // Default: any CPU
-    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-    bool     strict_cpu                  = false;   // Use strict CPU placement
-    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
-};
-
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();

 //
-// Common params
+// CLI argument parsing
 //

-enum llama_example {
-    LLAMA_EXAMPLE_COMMON,
-    LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
-    LLAMA_EXAMPLE_EMBEDDING,
-    LLAMA_EXAMPLE_PERPLEXITY,
-    LLAMA_EXAMPLE_RETRIEVAL,
-    LLAMA_EXAMPLE_PASSKEY,
-    LLAMA_EXAMPLE_IMATRIX,
-    LLAMA_EXAMPLE_BENCH,
-    LLAMA_EXAMPLE_SERVER,
-    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
-    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
-    LLAMA_EXAMPLE_LOOKUP,
-    LLAMA_EXAMPLE_PARALLEL,
-
-    LLAMA_EXAMPLE_COUNT,
-};
-
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-};
-
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
 };

-// sampler parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
-    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   dynatemp_range    = 0.00f; // 0.0 = disabled
-    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.00f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = false; // consider newlines as a repeatable token
-    bool    ignore_eos        = false;
-    bool    no_perf           = false; // disable performance metrics
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
+struct cpu_params {
+    int      n_threads                   = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid                  = false;   // Default: any CPU
+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };

 struct gpt_params {
@@ -183,25 +118,25 @@ struct gpt_params {
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct gpt_sampler_params sparams;
+    struct gpt_sampling_params sparams;

-    std::string model                = ""; // model path                                                    // NOLINT
-    std::string model_draft          = ""; // draft model for speculative decoding                          // NOLINT
-    std::string model_alias          = "unknown"; // model alias                                            // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
-    std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
-    std::string prompt               = "";                                                                  // NOLINT
-    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state             // NOLINT
-    std::string input_prefix         = ""; // string to prefix user inputs with                             // NOLINT
-    std::string input_suffix         = ""; // string to suffix user inputs with                             // NOLINT
-    std::string logdir               = ""; // directory in which to save YAML log files                     // NOLINT
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding           // NOLINT
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding          // NOLINT
-    std::string logits_file          = ""; // file for saving *all* logits                                  // NOLINT
-    std::string rpc_servers          = ""; // comma separated list of RPC servers                           // NOLINT
+    std::string model                = ""; // model path
+    std::string model_draft          = ""; // draft model for speculative decoding
+    std::string model_alias          = "unknown"; // model alias
+    std::string model_url            = ""; // model url to download
+    std::string hf_token             = ""; // HF token
+    std::string hf_repo              = ""; // HF repo
+    std::string hf_file              = ""; // HF file
+    std::string prompt               = "";
+    std::string prompt_file          = ""; // store the external prompt file name
+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
+    std::string input_prefix         = ""; // string to prefix user inputs with
+    std::string input_suffix         = ""; // string to suffix user inputs with
+    std::string logdir               = ""; // directory in which to save YAML log files
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+    std::string logits_file          = ""; // file for saving *all* logits
+    std::string rpc_servers          = ""; // comma separated list of RPC servers

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -245,8 +180,6 @@ struct gpt_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
-    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = true;  // context shift on inifinite text generation

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool logits_all        = false; // return logits for all tokens in the batch
@@ -254,6 +187,7 @@ struct gpt_params {
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
+    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
@@ -263,7 +197,7 @@ struct gpt_params {
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    std::string mmproj = "";        // path to multimodal projector
    std::vector<std::string> image; // path to image file(s)

    // embedding
@@ -271,7 +205,6 @@ struct gpt_params {
    int32_t embd_normalize = 2;     // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep   = "\n";  // separator of embendings
-    bool reranking         = false; // enable reranking support on server

    // server params
    int32_t port           = 8080;         // server listens on this network port
@@ -280,15 +213,15 @@ struct gpt_params {
    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

    std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";                                                                         // NOLINT
-    std::string chat_template = "";                                                                         // NOLINT
-    std::string system_prompt = "";                                                                         // NOLINT
+    std::string public_path   = "";
+    std::string chat_template = "";
+    std::string system_prompt = "";
    bool enable_chat_template = true;

    std::vector<std::string> api_keys;

-    std::string ssl_file_key  = "";                                                                         // NOLINT
-    std::string ssl_file_cert = "";                                                                         // NOLINT
+    std::string ssl_file_key  = "";
+    std::string ssl_file_cert = "";

    bool endpoint_slots   = true;
    bool endpoint_metrics = false;
@@ -338,14 +271,15 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
-    // batched-bench params
-    bool batched_bench_output_jsonl = false;
 };

-// call once at the start of a program if it uses libcommon
-// initializes the logging system and prints info about the build
-void gpt_init();
+void gpt_params_parse_from_env(gpt_params & params);
+void gpt_params_handle_model_default(gpt_params & params);
+
+bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
+bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_params_get_system_info(const gpt_params & params);

@@ -382,11 +316,6 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

-std::string string_from(bool value);
-std::string string_from(const std::vector<int> & values);
-std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
-std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
-
 //
 // Filesystem utils
 //
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -94,9 +94,6 @@ namespace console {
                simple_io = true;
            }
        }
-        if (simple_io) {
-            _setmode(_fileno(stdin), _O_U8TEXT);
-        }
 #else
        // POSIX-specific console initialization
        if (!simple_io) {
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -1,401 +0,0 @@
-#include "log.h"
-
-#include <condition_variable>
-#include <cstdarg>
-#include <cstdio>
-#include <mutex>
-#include <sstream>
-#include <thread>
-#include <vector>
-
-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
-
-void gpt_log_set_verbosity_thold(int verbosity) {
-    gpt_log_verbosity_thold = verbosity;
-}
-
-#define LOG_COL_DEFAULT "\033[0m"
-#define LOG_COL_BOLD    "\033[1m"
-#define LOG_COL_RED     "\033[31m"
-#define LOG_COL_GREEN   "\033[32m"
-#define LOG_COL_YELLOW  "\033[33m"
-#define LOG_COL_BLUE    "\033[34m"
-#define LOG_COL_MAGENTA "\033[35m"
-#define LOG_COL_CYAN    "\033[36m"
-#define LOG_COL_WHITE   "\033[37m"
-
-static int64_t t_us() {
-    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-}
-
-// colors
-enum gpt_log_col : int {
-    GPT_LOG_COL_DEFAULT = 0,
-    GPT_LOG_COL_BOLD,
-    GPT_LOG_COL_RED,
-    GPT_LOG_COL_GREEN,
-    GPT_LOG_COL_YELLOW,
-    GPT_LOG_COL_BLUE,
-    GPT_LOG_COL_MAGENTA,
-    GPT_LOG_COL_CYAN,
-    GPT_LOG_COL_WHITE,
-};
-
-// disable colors by default
-static std::vector<const char *> g_col = {
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-    "",
-};
-
-struct gpt_log_entry {
-    enum ggml_log_level level;
-
-    bool prefix;
-
-    int64_t timestamp;
-
-    std::vector<char> msg;
-
-    // signals the worker thread to stop
-    bool is_end;
-
-    void print(FILE * file = nullptr) const {
-        FILE * fcur = file;
-        if (!fcur) {
-            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
-            // these messages will still be logged to a file
-            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
-                return;
-            }
-
-            fcur = stdout;
-
-            if (level != GGML_LOG_LEVEL_NONE) {
-                fcur = stderr;
-            }
-        }
-
-        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
-            if (timestamp) {
-                // [M.s.ms.us]
-                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-                        g_col[GPT_LOG_COL_BLUE],
-                        (int) (timestamp / 1000000 / 60),
-                        (int) (timestamp / 1000000 % 60),
-                        (int) (timestamp / 1000 % 1000),
-                        (int) (timestamp % 1000),
-                        g_col[GPT_LOG_COL_DEFAULT]);
-            }
-
-            switch (level) {
-                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
-                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
-                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
-                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
-                default:
-                    break;
-            }
-        }
-
-        fprintf(fcur, "%s", msg.data());
-
-        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
-        }
-
-        fflush(fcur);
-    }
-};
-
-struct gpt_log {
-    // default capacity - will be expanded if needed
-    gpt_log() : gpt_log(256) {}
-
-    gpt_log(size_t capacity) {
-        file = nullptr;
-        prefix = false;
-        timestamps = false;
-        running = false;
-        t_start = t_us();
-
-        // initial message size - will be expanded if longer messages arrive
-        entries.resize(capacity);
-        for (auto & entry : entries) {
-            entry.msg.resize(256);
-        }
-
-        head = 0;
-        tail = 0;
-
-        resume();
-    }
-
-    ~gpt_log() {
-        pause();
-        if (file) {
-            fclose(file);
-        }
-    }
-
-private:
-    std::mutex mtx;
-    std::thread thrd;
-    std::condition_variable cv;
-
-    FILE * file;
-
-    bool prefix;
-    bool timestamps;
-    bool running;
-
-    int64_t t_start;
-
-    // ring buffer of entries
-    std::vector<gpt_log_entry> entries;
-    size_t head;
-    size_t tail;
-
-    // worker thread copies into this
-    gpt_log_entry cur;
-
-public:
-    void add(enum ggml_log_level level, const char * fmt, va_list args) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        if (!running) {
-            // discard messages while the worker thread is paused
-            return;
-        }
-
-        auto & entry = entries[tail];
-
-        {
-            // cannot use args twice, so make a copy in case we need to expand the buffer
-            va_list args_copy;
-            va_copy(args_copy, args);
-
-#if 1
-            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
-            if (n >= entry.msg.size()) {
-                entry.msg.resize(n + 1);
-                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
-            }
-#else
-            // hack for bolding arguments
-
-            std::stringstream ss;
-            for (int i = 0; fmt[i] != 0; i++) {
-                if (fmt[i] == '%') {
-                    ss << LOG_COL_BOLD;
-                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
-                    ss << LOG_COL_DEFAULT;
-                    if (fmt[i] == 0) break;
-                }
-                ss << fmt[i];
-            }
-            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
-            if (n >= entry.msg.size()) {
-                entry.msg.resize(n + 1);
-                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
-            }
-#endif
-        }
-
-        entry.level = level;
-        entry.prefix = prefix;
-        entry.timestamp = 0;
-        if (timestamps) {
-            entry.timestamp = t_us() - t_start;
-        }
-        entry.is_end = false;
-
-        tail = (tail + 1) % entries.size();
-        if (tail == head) {
-            // expand the buffer
-            std::vector<gpt_log_entry> new_entries(2*entries.size());
-
-            size_t new_tail = 0;
-
-            do {
-                new_entries[new_tail] = std::move(entries[head]);
-
-                head     = (head     + 1) % entries.size();
-                new_tail = (new_tail + 1);
-            } while (head != tail);
-
-            head = 0;
-            tail = new_tail;
-
-            for (size_t i = tail; i < new_entries.size(); i++) {
-                new_entries[i].msg.resize(256);
-            }
-
-            entries = std::move(new_entries);
-        }
-
-        cv.notify_one();
-    }
-
-    void resume() {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        if (running) {
-            return;
-        }
-
-        running = true;
-
-        thrd = std::thread([this]() {
-            while (true) {
-                {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
-
-                    cur = entries[head];
-
-                    head = (head + 1) % entries.size();
-                }
-
-                if (cur.is_end) {
-                    break;
-                }
-
-                cur.print(); // stdout and stderr
-
-                if (file) {
-                    cur.print(file);
-                }
-            }
-        });
-    }
-
-    void pause() {
-        {
-            std::lock_guard<std::mutex> lock(mtx);
-
-            if (!running) {
-                return;
-            }
-
-            running = false;
-
-            // push an entry to signal the worker thread to stop
-            {
-                auto & entry = entries[tail];
-                entry.is_end = true;
-
-                tail = (tail + 1) % entries.size();
-            }
-
-            cv.notify_one();
-        }
-
-        thrd.join();
-    }
-
-    void set_file(const char * path) {
-        pause();
-
-        if (file) {
-            fclose(file);
-        }
-
-        if (path) {
-            file = fopen(path, "w");
-        } else {
-            file = nullptr;
-        }
-
-        resume();
-    }
-
-    void set_colors(bool colors) {
-        pause();
-
-        if (colors) {
-            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
-            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
-            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
-            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
-            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
-            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
-            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
-        } else {
-            for (size_t i = 0; i < g_col.size(); i++) {
-                g_col[i] = "";
-            }
-        }
-
-        resume();
-    }
-
-    void set_prefix(bool prefix) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        this->prefix = prefix;
-    }
-
-    void set_timestamps(bool timestamps) {
-        std::lock_guard<std::mutex> lock(mtx);
-
-        this->timestamps = timestamps;
-    }
-};
-
-//
-// public API
-//
-
-struct gpt_log * gpt_log_init() {
-    return new gpt_log;
-}
-
-struct gpt_log * gpt_log_main() {
-    static struct gpt_log log;
-
-    return &log;
-}
-
-void gpt_log_pause(struct gpt_log * log) {
-    log->pause();
-}
-
-void gpt_log_resume(struct gpt_log * log) {
-    log->resume();
-}
-
-void gpt_log_free(struct gpt_log * log) {
-    delete log;
-}
-
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    log->add(level, fmt, args);
-    va_end(args);
-}
-
-void gpt_log_set_file(struct gpt_log * log, const char * file) {
-    log->set_file(file);
-}
-
-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
-    log->set_colors(colors);
-}
-
-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
-    log->set_prefix(prefix);
-}
-
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
-    log->set_timestamps(timestamps);
-}
--- a/common/log.h
+++ b/common/log.h
@@ -1,92 +1,724 @@
 #pragma once

-#include "ggml.h" // for ggml_log_level
+#include <chrono>
+#include <cstring>
+#include <sstream>
+#include <iostream>
+#include <thread>
+#include <vector>
+#include <algorithm>
+#include <cinttypes>

-#ifndef __GNUC__
-#    define LOG_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+// --------------------------------
+//
+// Basic usage:
+//
+// --------
+//
+//  The LOG() and LOG_TEE() macros are ready to go by default
+//   they do not require any initialization.
+//
+//  LOGLN() and LOG_TEELN() are variants which automatically
+//   include \n character at the end of the log string.
+//
+//  LOG() behaves exactly like printf, by default writing to a logfile.
+//  LOG_TEE() additionally prints to the screen ( mimics the Unix tee command ).
+//
+//  Default logfile is named
+//   "llama.log" ( or "llama.<threadID>.log" when log_multilog(true) is used )
+//  Default LOG_TEE() secondary output target is
+//   stderr
+//
+//  Logs can be dynamically disabled or enabled using functions:
+//   log_disable()
+//  and
+//   log_enable()
+//
+//  A log target can be changed with:
+//   log_set_target( string )
+//    creating and opening, or re-opening a file by string filename
+//  or
+//   log_set_target( FILE* )
+//    allowing to point at stderr, stdout, or any valid FILE* file handle.
+//
+// --------
+//
+// End of Basic usage.
+//
+// --------------------------------
+
+// Specifies a log target.
+//  default uses log_handler() with "llama.log" log file
+//  this can be changed, by defining LOG_TARGET
+//  like so:
+//
+//  #define LOG_TARGET (a valid FILE*)
+//  #include "log.h"
+//
+//  or it can be simply redirected to stdout or stderr
+//  like so:
+//
+//  #define LOG_TARGET stderr
+//  #include "log.h"
+//
+//  The log target can also be redirected to a different function
+//  like so:
+//
+//  #define LOG_TARGET log_handler_different()
+//  #include "log.h"
+//
+//  FILE* log_handler_different()
+//  {
+//      return stderr;
+//  }
+//
+//  or:
+//
+//  #define LOG_TARGET log_handler_another_one("somelog.log")
+//  #include "log.h"
+//
+//  FILE* log_handler_another_one(char*filename)
+//  {
+//      static FILE* logfile = nullptr;
+//      (...)
+//      if( !logfile )
+//      {
+//          fopen(...)
+//      }
+//      (...)
+//      return logfile
+//  }
+//
+#ifndef LOG_TARGET
+    #define LOG_TARGET log_handler()
 #endif

-#define LOG_DEFAULT_DEBUG 1
-#define LOG_DEFAULT_LLAMA 0
+#ifndef LOG_TEE_TARGET
+    #define LOG_TEE_TARGET stderr
+#endif

-// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+// Utility for synchronizing log configuration state
+//  since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};

-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+// Utility to obtain "pid" like unique process id and use it when creating log files.
+inline std::string log_get_pid()
+{
+   static std::string pid;
+   if (pid.empty())
+   {
+       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+       //  it's not the same as "pid" but is unique enough to solve multiple instances
+       //  trying to write to the same log.
+       std::stringstream ss;
+       ss << std::this_thread::get_id();
+       pid = ss.str();
+   }

-// the gpt_log uses an internal worker thread to print/write log messages
-// when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+   return pid;
+}

-struct gpt_log * gpt_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
-void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
-void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
-void             gpt_log_free  (struct gpt_log * log);
+// Utility function for generating log file names, optionally with a unique id based on thread id.
+//  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.log", or
+//  "llama.<number>.log" when multilog is enabled, where the number is a runtime id of the current thread.

-LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)

-// defaults: file = NULL, colors = false, prefix = false, timestamps = false
-//
-// regular log output:
-//
-//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
-//   llm_load_tensors: ggml ctx size =    0.27 MiB
-//   llm_load_tensors: offloading 32 repeating layers to GPU
-//   llm_load_tensors: offloading non-repeating layers to GPU
-//
-// with prefix = true, timestamps = true, the log output will look like this:
-//
-//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
-//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
-//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
-//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
-//
-// I - info    (stdout, V = 0)
-// W - warning (stderr, V = 0)
-// E - error   (stderr, V = 0)
-// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
-//
+// INTERNAL, DO NOT USE
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
+{
+    static bool _multilog = false;

-void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
-void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
-void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix
+    if (multilog != LogTriStateSame)
+    {
+        _multilog = multilog == LogTriStateTrue;
+    }

-// helper macros for logging
-// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
-//
-// for example:
-//
-//   LOG_DBG("this is a debug message: %d\n", expensive_function());
-//
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
-//
+    std::stringstream buf;

-#define LOG_TMPL(level, verbosity, ...) \
-    do { \
-        if ((verbosity) <= gpt_log_verbosity_thold) { \
-            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
-        } \
+    buf << log_file_basename;
+    if (_multilog)
+    {
+        buf << ".";
+        buf << log_get_pid();
+    }
+    buf << ".";
+    buf << log_file_extension;
+
+    return buf.str();
+}
+
+#ifndef LOG_DEFAULT_FILE_NAME
+    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
+#endif
+
+// Utility for turning #define values into string literals
+//  so we can have a define for stderr and
+//  we can print "stderr" instead of literal stderr, etc.
+#define LOG_STRINGIZE1(s) #s
+#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
+
+#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
+
+// Allows disabling timestamps.
+//  in order to disable, define LOG_NO_TIMESTAMPS
+//  like so:
+//
+//  #define LOG_NO_TIMESTAMPS
+//  #include "log.h"
+//
+#ifndef LOG_NO_TIMESTAMPS
+    #ifndef _MSC_VER
+        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #else
+        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #endif
+#else
+    #define LOG_TIMESTAMP_FMT "%s"
+    #define LOG_TIMESTAMP_VAL ,""
+#endif
+
+#ifdef LOG_TEE_TIMESTAMPS
+    #ifndef _MSC_VER
+        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #else
+        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
+        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
+    #endif
+#else
+    #define LOG_TEE_TIMESTAMP_FMT "%s"
+    #define LOG_TEE_TIMESTAMP_VAL ,""
+#endif
+
+// Allows disabling file/line/function prefix
+//  in order to disable, define LOG_NO_FILE_LINE_FUNCTION
+//  like so:
+//
+//  #define LOG_NO_FILE_LINE_FUNCTION
+//  #include "log.h"
+//
+#ifndef LOG_NO_FILE_LINE_FUNCTION
+    #ifndef _MSC_VER
+        #define LOG_FLF_FMT "[%24s:%5d][%24s] "
+        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #else
+        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
+        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+    #endif
+#else
+    #define LOG_FLF_FMT "%s"
+    #define LOG_FLF_VAL ,""
+#endif
+
+#ifdef LOG_TEE_FILE_LINE_FUNCTION
+    #ifndef _MSC_VER
+        #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
+        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
+    #else
+        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
+        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+    #endif
+#else
+    #define LOG_TEE_FLF_FMT "%s"
+    #define LOG_TEE_FLF_VAL ,""
+#endif
+
+// INTERNAL, DO NOT USE
+//  USE LOG() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+    #define LOG_IMPL(str, ...)                                                                                      \
+    do {                                                                                                            \
+        if (LOG_TARGET != nullptr)                                                                                  \
+        {                                                                                                           \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TARGET);                                                                                     \
+        }                                                                                                           \
    } while (0)
+#else
+    #define LOG_IMPL(str, ...)                                                                                           \
+    do {                                                                                                                 \
+        if (LOG_TARGET != nullptr)                                                                                       \
+        {                                                                                                                \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TARGET);                                                                                          \
+        }                                                                                                                \
+    } while (0)
+#endif

-#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
-#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
+// INTERNAL, DO NOT USE
+//  USE LOG_TEE() INSTEAD
+//
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
+    do {                                                                                                                                \
+        if (LOG_TARGET != nullptr)                                                                                                      \
+        {                                                                                                                               \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__);                     \
+            fflush(LOG_TARGET);                                                                                                         \
+        }                                                                                                                               \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                         \
+        {                                                                                                                               \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
+            fflush(LOG_TEE_TARGET);                                                                                                     \
+        }                                                                                                                               \
+    } while (0)
+#else
+    #define LOG_TEE_IMPL(str, ...)                                                                                                           \
+    do {                                                                                                                                     \
+        if (LOG_TARGET != nullptr)                                                                                                           \
+        {                                                                                                                                    \
+            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__);                     \
+            fflush(LOG_TARGET);                                                                                                              \
+        }                                                                                                                                    \
+        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                              \
+        {                                                                                                                                    \
+            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
+            fflush(LOG_TEE_TARGET);                                                                                                          \
+        }                                                                                                                                    \
+    } while (0)
+#endif

-#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
-#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
-#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
-#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  0,                 __VA_ARGS__)
+// The extra string ( "" or "\n" ) passed as the last argument is a trick to bypass the silly
+//  "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
+//  so we can have a single macro which can be called just like printf.

-#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
-#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
-#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
-#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
+// Main LOG macro.
+//  behaves like printf, and supports arguments the exact same way.
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
+#else
+    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif
+
+// Main TEE macro.
+//  does the same as LOG
+//  and
+//  simultaneously writes to stderr.
+//
+// Secondary target can be changed just like LOG_TARGET
+//  by defining LOG_TEE_TARGET
+//
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
+#else
+    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
+#endif
+
+// LOG macro variants with auto endline.
+#if !defined(_MSC_VER) || defined(__clang__)
+    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
+    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
+#else
+    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+#endif
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+{
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
+    static std::string log_current_filename{filename};
+    static FILE *log_current_target{target};
+    static FILE *logfile = nullptr;
+
+    if (change)
+    {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }
+
+        if (disable == LogTriStateTrue)
+        {
+            // Disable primary target
+            _disabled = true;
+        }
+        // If previously disabled, only enable, and keep previous target
+        else if (disable == LogTriStateFalse)
+        {
+            _disabled = false;
+        }
+        // Otherwise, process the arguments
+        else if (log_current_filename != filename || log_current_target != target)
+        {
+            _initialized = false;
+        }
+    }
+
+    if (_disabled)
+    {
+        // Log is disabled
+        return nullptr;
+    }
+
+    if (_initialized)
+    {
+        // with fallback in case something went wrong
+        return logfile ? logfile : stderr;
+    }
+
+    // do the (re)initialization
+    if (target != nullptr)
+    {
+        if (logfile != nullptr && logfile != stdout && logfile != stderr)
+        {
+            fclose(logfile);
+        }
+
+        log_current_filename = LOG_DEFAULT_FILE_NAME;
+        log_current_target = target;
+
+        logfile = target;
+    }
+    else
+    {
+        if (log_current_filename != filename)
+        {
+            if (logfile != nullptr && logfile != stdout && logfile != stderr)
+            {
+                fclose(logfile);
+            }
+        }
+
+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
+    }
+
+    if (!logfile)
+    {
+        //  Verify whether the file was opened, otherwise fallback to stderr
+        logfile = stderr;
+
+        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
+        fflush(stderr);
+
+        // At this point we let the init flag be to true below, and let the target fallback to stderr
+        //  otherwise we would repeatedly fopen() which was already unsuccessful
+    }
+
+    _initialized = true;
+
+    return logfile ? logfile : stderr;
+}
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+{
+    return log_handler1_impl(change, append, disable, filename, target);
+}
+
+// Disables logs entirely at runtime.
+//  Makes LOG() and LOG_TEE() produce no output,
+//  until enabled back.
+#define log_disable() log_disable_impl()
+
+// INTERNAL, DO NOT USE
+inline FILE *log_disable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
+}
+
+// Enables logs at runtime.
+#define log_enable() log_enable_impl()
+
+// INTERNAL, DO NOT USE
+inline FILE *log_enable_impl()
+{
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
+}
+
+// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
+#define log_set_target(target) log_set_target_impl(target)
+
+// INTERNAL, DO NOT USE
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
+
+// INTERNAL, DO NOT USE
+inline FILE *log_handler() { return log_handler1_impl(); }
+
+// Enable or disable creating separate log files for each run.
+//  can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+//  can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}
+
+inline void log_test()
+{
+    log_disable();
+    LOG("01 Hello World to nobody, because logs are disabled!\n");
+    log_enable();
+    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
+    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
+    log_set_target(stderr);
+    LOG("04 Hello World to stderr!\n");
+    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("06 Hello World to default log file!\n");
+    log_set_target(stdout);
+    LOG("07 Hello World to stdout!\n");
+    log_set_target(LOG_DEFAULT_FILE_NAME);
+    LOG("08 Hello World to default log file again!\n");
+    log_disable();
+    LOG("09 Hello World _1_ into the void!\n");
+    log_enable();
+    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
+    log_disable();
+    log_set_target("llama.anotherlog.log");
+    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
+    log_enable();
+    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
+    log_set_target("llama.yetanotherlog.log");
+    LOG("13 Hello World this time in yet new file?\n");
+    log_set_target(log_filename_generator("llama_autonamed", "log"));
+    LOG("14 Hello World in log with generated filename!\n");
+#ifdef _MSC_VER
+    LOG_TEE("15 Hello msvc TEE without arguments\n");
+    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
+    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
+    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
+    LOG("19 Hello msvc LOG without arguments\n");
+    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
+    LOGLN("21 Hello msvc LOGLN without arguments\n");
+    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
+#endif
+}
+
+inline bool log_param_single_parse(const std::string & param)
+{
+    if ( param == "--log-test")
+    {
+        log_test();
+        return true;
+    }
+
+    if ( param == "--log-disable")
+    {
+        log_disable();
+        return true;
+    }
+
+    if ( param == "--log-enable")
+    {
+        log_enable();
+        return true;
+    }
+
+    if (param == "--log-new")
+    {
+        log_multilog(true);
+        return true;
+    }
+
+    if (param == "--log-append")
+    {
+        log_append(true);
+        return true;
+    }
+
+    return false;
+}
+
+inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
+{
+    if ( param == "--log-file")
+    {
+        if (!check_but_dont_parse)
+        {
+            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
+        }
+
+        return true;
+    }
+
+    return false;
+}
+
+inline void log_print_usage()
+{
+    printf("log options:\n");
+    /* format
+    printf("  -h, --help            show this help message and exit\n");*/
+    /* spacing
+    printf("__-param----------------Description\n");*/
+    printf("  --log-test            Run simple logging test\n");
+    printf("  --log-disable         Disable trace logs\n");
+    printf("  --log-enable          Enable trace logs\n");
+    printf("  --log-file            Specify a log filename (without extension)\n");
+    printf("  --log-new             Create a separate new log file on start. "
+                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
+    printf("  --log-append          Don't truncate the old log file.\n");
+    printf("\n");
+}
+
+#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
+
+// INTERNAL, DO NOT USE
+inline void log_dump_cmdline_impl(int argc, char **argv)
+{
+    std::stringstream buf;
+    for (int i = 0; i < argc; ++i)
+    {
+        if (std::string(argv[i]).find(' ') != std::string::npos)
+        {
+            buf << " \"" << argv[i] <<"\"";
+        }
+        else
+        {
+            buf << " " << argv[i];
+        }
+    }
+    LOGLN("Cmd:%s", buf.str().c_str());
+}
+
+#define log_tostr(var) log_var_to_string_impl(var).c_str()
+
+inline std::string log_var_to_string_impl(bool var)
+{
+    return var ? "true" : "false";
+}
+
+inline std::string log_var_to_string_impl(std::string var)
+{
+    return var;
+}
+
+inline std::string log_var_to_string_impl(const std::vector<int> & var)
+{
+    std::stringstream buf;
+    buf << "[ ";
+    bool first = true;
+    for (auto e : var)
+    {
+        if (first)
+        {
+            first = false;
+        }
+        else
+        {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+template <typename C, typename T>
+inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
+{
+    std::stringstream buf;
+    buf << "[ ";
+
+    bool first = true;
+    for (const auto & token : tokens)
+    {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = llama_token_to_piece(ctx, token);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf
+            << "'" << detokenized << "'"
+            << ":" << std::to_string(token);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+template <typename C, typename B>
+inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
+{
+    std::stringstream buf;
+    buf << "[ ";
+
+    bool first = true;
+    for (int i = 0; i < batch.n_tokens; ++i)
+    {
+        if (!first) {
+            buf << ", ";
+        } else {
+            first = false;
+        }
+
+        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+
+        detokenized.erase(
+            std::remove_if(
+                detokenized.begin(),
+                detokenized.end(),
+                [](const unsigned char c) { return !std::isprint(c); }),
+            detokenized.end());
+
+        buf
+            << "\n" << std::to_string(i)
+            << ":token '" << detokenized << "'"
+            << ":pos " << std::to_string(batch.pos[i])
+            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
+            << ":seq_id " << std::to_string(batch.seq_id[i][0])
+            << ":logits " << std::to_string(batch.logits[i]);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+#ifdef LOG_DISABLE_LOGS
+
+#undef LOG
+#define LOG(...) // dummy stub
+#undef LOGLN
+#define LOGLN(...) // dummy stub
+
+#undef LOG_TEE
+#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+#undef LOG_TEELN
+#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
+
+#undef LOG_DISABLE
+#define LOG_DISABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_ENABLE
+#define LOG_ENABLE() // dummy stub
+
+#undef LOG_SET_TARGET
+#define LOG_SET_TARGET(...) // dummy stub
+
+#undef LOG_DUMP_CMDLINE
+#define LOG_DUMP_CMDLINE(...) // dummy stub
+
+#endif // LOG_DISABLE_LOGS
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -2,11 +2,8 @@
 #include "common.h"
 #include "log.h"

-#include <cinttypes>
 #include <cstdint>
-#include <cstdio>
 #include <fstream>
-#include <thread>

 void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -2,130 +2,7 @@

 #include "common.h"

-#include <cmath>
-#include <unordered_map>
-
-// the ring buffer works similarly to std::deque, but with a fixed capacity
-// TODO: deduplicate with llama-impl.h
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
-
-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-    std::vector<T> data;
-};
-
-struct gpt_sampler {
-    gpt_sampler_params params;
-
-    struct llama_sampler * grmr;
-    struct llama_sampler * chain;
-
-    ring_buffer<llama_token> prev;
-
-    std::vector<llama_token_data> cur;
-
-    llama_token_data_array cur_p;
-
-    void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
-
-        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-        }
-
-        cur_p = { cur.data(), cur.size(), -1, false };
-    }
-};
-
-std::string gpt_sampler_params::print() const {
+std::string gpt_sampling_params::print_all() const {
    char result[1024];

    snprintf(result, sizeof(result),
@@ -139,212 +16,99 @@ std::string gpt_sampler_params::print() const {
    return std::string(result);
 }

-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
-    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
-
-    lparams.no_perf = params.no_perf;
-
-    auto * result = new gpt_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_n_vocab(model),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
-
-    if (params.temp > 0.0f) {
-        if (params.mirostat == 0) {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case GPT_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
-                        break;
-                    case GPT_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TFS_Z:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
-                        break;
-                    case GPT_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    default:
-                        GGML_ASSERT(false && "unknown sampler type");
-                }
+std::string gpt_sampling_params::print_samplers() const {
+    std::string result = "CFG -> Penalties ";
+    if (mirostat == 0) {
+        for (const auto & sampler : samplers) {
+            const auto name = llama_sampling_type_to_str(sampler);
+            if (!name.empty()) {
+                result += "-> " + name + " ";
            }
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-        } else if (params.mirostat == 1) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-        } else if (params.mirostat == 2) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-        } else {
-            GGML_ASSERT(false && "unknown mirostat version");
        }
    } else {
-        if (params.n_probs > 0) {
-            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
-            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
-            //
-            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
-            // it is much faster, since we avoid sorting all tokens and should give a good approximation
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
+        result += "-> mirostat ";
    }

    return result;
 }

-void gpt_sampler_free(struct gpt_sampler * gsmpl) {
-    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
+struct llama_sampling * llama_sampling_init(const struct llama_model * model, const struct gpt_sampling_params & params) {
+    llama_sampling_params lparams = llama_sampling_default_params();

-        llama_sampler_free(gsmpl->chain);
+    lparams.seed              = params.seed;
+    lparams.n_prev            = params.n_prev;
+    lparams.n_probs           = params.n_probs;
+    lparams.min_keep          = params.min_keep;
+    lparams.top_k             = params.top_k;
+    lparams.top_p             = params.top_p;
+    lparams.min_p             = params.min_p;
+    lparams.tfs_z             = params.tfs_z;
+    lparams.typ_p             = params.typ_p;
+    lparams.temp              = params.temp;
+    lparams.dynatemp_range    = params.dynatemp_range;
+    lparams.dynatemp_exponent = params.dynatemp_exponent;
+    lparams.penalty_last_n    = params.penalty_last_n;
+    lparams.penalty_repeat    = params.penalty_repeat;
+    lparams.penalty_freq      = params.penalty_freq;
+    lparams.penalty_present   = params.penalty_present;
+    lparams.mirostat          = params.mirostat;
+    lparams.mirostat_tau      = params.mirostat_tau;
+    lparams.mirostat_eta      = params.mirostat_eta;
+    lparams.penalize_nl       = params.penalize_nl;
+    lparams.ignore_eos        = params.ignore_eos;

-        delete gsmpl;
-    }
-}
-
-void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
+    lparams.n_samplers = params.samplers.size();
+    for (int i = 0; i < lparams.n_samplers; i++) {
+        lparams.samplers[i] = params.samplers[i];
    }

-    llama_sampler_accept(gsmpl->chain, token);
+    struct llama_sampling * result = llama_sampling_init(model, lparams);

-    gsmpl->prev.push_back(token);
+    llama_sampling_set_grammar   (result, params.grammar.c_str(), "root");
+    llama_sampling_set_logit_bias(result, params.logit_bias.size(), params.logit_bias.data());
+
+    return result;
 }

-void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
-}
-
-struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
-    return new gpt_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
-    };
-}
-
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
-    // TODO: measure grammar performance
-
-    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
-    }
-    if (ctx) {
-        llama_perf_context_print(ctx);
-    }
-}
-
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    gsmpl->set_logits(ctx, idx);
-
-    auto & grmr  = gsmpl->grmr;
-    auto & chain = gsmpl->chain;
-    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
+void llama_sampling_cp(llama_sampling * src, llama_sampling *& dst) {
+    if (dst) {
+        llama_sampling_free(dst);
    }

-    llama_sampler_apply(chain, &cur_p);
+    dst = llama_sampling_cp(src);
+}

-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
+llama_token llama_sampling_sample(
+        struct llama_sampling * smpl,
+        struct llama_context * ctx,
+        int idx) {
+    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx));

-    const llama_token id = cur_p.data[cur_p.selected].id;
+    // first, sample the token without any grammar constraints
+    const llama_token id = llama_sampling_sample(smpl, nullptr);

-    if (grammar_first) {
+    // create an array with a single token data element for the sampled id
+    llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
+    llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
+
+    llama_sampling_grammar(smpl, &single_token_data_array);
+
+    // check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
+    const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+    if (is_valid) {
        return id;
    }

-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+    // if the token is not valid, sample again, after applying the grammar constraints
+    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx, idx));

-        llama_sampler_apply(grmr, &single_token_data_array);
+    llama_sampling_grammar(smpl, nullptr);

-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
+    return llama_sampling_sample(smpl, nullptr);
 }

-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
-    return llama_sampler_get_seed(gsmpl->chain);
-}
-
-// helpers
-
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
-    return &gsmpl->cur_p;
-}
-
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
-    return gsmpl->prev.rat(0);
-}
-
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
-    std::string result = "logits ";
-
-    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
-        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
-    }
-
-    return result;
-}
-
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
-    n = std::min(n, (int) gsmpl->prev.size());
+std::string llama_sampling_prev_str(llama_sampling * smpl, llama_context * ctx_main, int n) {
+    n = std::min(n, llama_sampling_n_prev(smpl));

    if (n <= 0) {
        return "";
@@ -354,7 +118,7 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    for (int i = n - 1; i >= 0; i--) {
-        const llama_token id = gsmpl->prev.rat(i);
+        const llama_token id = llama_sampling_prev(smpl, i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

@@ -364,57 +128,57 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
    return result;
 }

-char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
-    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
-        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
-        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
-        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
+char llama_sampling_type_to_chr(llama_sampler_type sampler) {
+    switch (sampler) {
+        case LLAMA_SAMPLER_TYPE_TOP_K:       return 'k';
+        case LLAMA_SAMPLER_TYPE_TFS_Z:       return 'f';
+        case LLAMA_SAMPLER_TYPE_TYPICAL_P:   return 'y';
+        case LLAMA_SAMPLER_TYPE_TOP_P:       return 'p';
+        case LLAMA_SAMPLER_TYPE_MIN_P:       return 'm';
+        case LLAMA_SAMPLER_TYPE_TEMPERATURE: return 't';
        default : return '?';
    }
 }

-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
-    switch (cnstr) {
-        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
-        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
-        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
-        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
-        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
-        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+std::string llama_sampling_type_to_str(llama_sampler_type sampler) {
+    switch (sampler) {
+        case LLAMA_SAMPLER_TYPE_TOP_K:       return "top_k";
+        case LLAMA_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
+        case LLAMA_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
+        case LLAMA_SAMPLER_TYPE_TOP_P:       return "top_p";
+        case LLAMA_SAMPLER_TYPE_MIN_P:       return "min_p";
+        case LLAMA_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default : return "";
    }
 }

-std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
-        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+        { "top_k",       LLAMA_SAMPLER_TYPE_TOP_K },
+        { "top_p",       LLAMA_SAMPLER_TYPE_TOP_P },
+        { "typ_p",       LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p",       LLAMA_SAMPLER_TYPE_MIN_P },
+        { "tfs_z",       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "temperature", LLAMA_SAMPLER_TYPE_TEMPERATURE },
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
-    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
-        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
-        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
-        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
-        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
-        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
-        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
-        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
+    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+        { "top-k",       LLAMA_SAMPLER_TYPE_TOP_K },
+        { "top-p",       LLAMA_SAMPLER_TYPE_TOP_P },
+        { "nucleus",     LLAMA_SAMPLER_TYPE_TOP_P },
+        { "typical-p",   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typical",     LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p",       LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "typ",         LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p",       LLAMA_SAMPLER_TYPE_MIN_P },
+        { "tfs-z",       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "tfs",         LLAMA_SAMPLER_TYPE_TFS_Z },
+        { "temp",        LLAMA_SAMPLER_TYPE_TEMPERATURE },
    };

-    std::vector<gpt_sampler_type> samplers;
+    std::vector<llama_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
@@ -434,17 +198,17 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
    return samplers;
 }

-std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
-        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_K),       LLAMA_SAMPLER_TYPE_TOP_K },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TFS_Z),       LLAMA_SAMPLER_TYPE_TFS_Z },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TYPICAL_P),   LLAMA_SAMPLER_TYPE_TYPICAL_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TOP_P),       LLAMA_SAMPLER_TYPE_TOP_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_MIN_P),       LLAMA_SAMPLER_TYPE_MIN_P },
+        { llama_sampling_type_to_chr(LLAMA_SAMPLER_TYPE_TEMPERATURE), LLAMA_SAMPLER_TYPE_TEMPERATURE }
    };

-    std::vector<gpt_sampler_type> samplers;
+    std::vector<llama_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -2,82 +2,78 @@

 #include "llama.h"

-#include "common.h"
-
 #include <string>
 #include <vector>

-// gpt_sampler extends llama_sampler with additional functionality:
-//
-//  - grammar support
-//  - custom sampler logic based on the parameters
-//  - history of the last accepted tokens
-//  - performance metrics
-//
-// This goal is to have a common implementation of the sampling logic shared across the examples.
-// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
-// complex (top-k, top-p, etc).
-//
-// Another example is related to the grammar. In general, the grammar constraints applied on the full
-// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
-// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
-// grammar constraints are applied to the full vocabulary and the token is resampled.
-//
-// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
-// be moved into the core llama library.
-//
-// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
-// This can be used to access the probabilities of the rest of the non-sampled tokens.
-//
-// TODO: measure grammar performance
-//
+// sampling parameters
+typedef struct gpt_sampling_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling

-struct gpt_sampler;
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
+    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range    = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = false; // consider newlines as a repeatable token
+    bool    ignore_eos        = false;

-// llama_sampler API overloads
+    std::vector<enum llama_sampler_type> samplers = {
+        LLAMA_SAMPLER_TYPE_TOP_K,
+        LLAMA_SAMPLER_TYPE_TFS_Z,
+        LLAMA_SAMPLER_TYPE_TYPICAL_P,
+        LLAMA_SAMPLER_TYPE_TOP_P,
+        LLAMA_SAMPLER_TYPE_MIN_P,
+        LLAMA_SAMPLER_TYPE_TEMPERATURE
+    };

-struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
+    std::string grammar; // optional BNF-like grammar to constrain sampling

-void gpt_sampler_free(struct gpt_sampler * gsmpl);
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply

-// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
-void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
-struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
+    // print the parameters into a string
+    std::string print_all() const;

-// arguments can be nullptr to skip printing
-void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
+    // print the samplers into a string
+    std::string print_samplers() const;
+} gpt_sampling_params;

-// extended sampling implementation:
+// overload of llama_sampling_init using gpt_sampling_params
+struct llama_sampling * llama_sampling_init(const struct llama_model * model, const struct gpt_sampling_params & params);
+
+void llama_sampling_cp(llama_sampling * src, llama_sampling *& dst);
+
+// common sampling implementation:
 //
 // - set logits
-// - apply the configured sampler chain
+// - apply the configured sampling constraints
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
-
-uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+llama_token llama_sampling_sample(
+        struct llama_sampling * smpl,
+         struct llama_context * ctx,
+                          int   idx);

 // helpers

-// access the internal list of current candidate tokens
-llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
-
-// get the last accepted token
-llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
-
-// print the sampler chain into a string
-std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
-
 // get a string representation of the last accepted tokens
-std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
+std::string llama_sampling_prev_str(llama_sampling * smpl, llama_context * ctx, int n);

-char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
-std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
+char        llama_sampling_type_to_chr(enum llama_sampler_type sampler_type);
+std::string llama_sampling_type_to_str(enum llama_sampler_type sampler_type);

-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
+std::vector<enum llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum llama_sampler_type> llama_sampling_types_from_chars(const std::string & chars);
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1,11 +1,9 @@
 #include "train.h"
 #include "common.h"

-#include <algorithm>
 #include <random>
 #include <sstream>
 #include <functional>
-#include <cstring>

 struct random_normal_distribution {
    std::mt19937 gen;
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -15,7 +15,6 @@ from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
-from itertools import chain

 import math
 import numpy as np
@@ -65,6 +64,7 @@ class Model:
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
+    is_lora: bool

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -72,7 +72,7 @@ class Model:
    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -94,6 +94,7 @@ class Model:
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py

        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
        if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -131,14 +132,12 @@ class Model:
    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
        tensor_names_from_parts: set[str] = set()

-        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-        index_name += ".index.json"
-        index_file = self.dir_model / index_name
-
-        if index_file.is_file():
+        if len(self.part_names) > 1:
            self.tensor_names = set()
+            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+            index_name += ".index.json"
            logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(index_file, "r", encoding="utf-8") as f:
+            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
                index: dict[str, Any] = json.load(f)
                weight_map = index.get("weight_map")
                if weight_map is None or not isinstance(weight_map, dict):
@@ -146,7 +145,6 @@ class Model:
                self.tensor_names.update(weight_map.keys())
        else:
            self.tensor_names = tensor_names_from_parts
-            weight_map = {}

        for part_name in self.part_names:
            logger.info(f"gguf: loading model part '{part_name}'")
@@ -173,17 +171,9 @@ class Model:
                            data = LazyTorchTensor.from_eager(data)
                    yield name, data

-        # verify tensor name presence and identify potentially missing files
-        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
-            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
-            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
-            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
-            if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
-            else:
-                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
-                                 f"Missing tensors: {missing}\n"
-                                 f"Extra tensors: {extra}")
+        # only verify tensor name presence; it doesn't matter if they are not in the right files
+        if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")

    def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
        if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -269,14 +259,10 @@ class Model:

        return False

-    # some models need extra generated tensors (like rope_freqs)
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        return ()
-
    def prepare_tensors(self):
        max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

-        for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
+        for name, data_torch in self.get_tensors():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
                continue
@@ -294,13 +280,8 @@ class Model:
                    bid = int(part)
                    break

-            for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
-                data = data_torch.squeeze().numpy()
-
-                # if data ends up empty, it means data_torch was a scalar tensor -> restore
-                if len(data.shape) == 0:
-                    data = data_torch.numpy()
-
+            for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
+                data: np.ndarray  # type hint
                n_dims = len(data.shape)
                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

@@ -321,28 +302,12 @@ class Model:
                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                            gguf.MODEL_TENSOR.TIME_MIX_W1,
                            gguf.MODEL_TENSOR.TIME_MIX_W2,
-                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
-                            gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                        )
                    )
                    or not new_name.endswith(".weight")
                ):
                    data_qtype = gguf.GGMLQuantizationType.F32

-                if data_qtype is False and any(
-                    self.match_model_tensor_name(new_name, key, bid)
-                    for key in (
-                        gguf.MODEL_TENSOR.TOKEN_EMBD,
-                        gguf.MODEL_TENSOR.OUTPUT,
-                    )
-                ):
-                    if self.ftype in (
-                        gguf.LlamaFileType.MOSTLY_TQ1_0,
-                        gguf.LlamaFileType.MOSTLY_TQ2_0,
-                    ):
-                        # TODO: use Q4_K and Q6_K
-                        data_qtype = gguf.GGMLQuantizationType.F16
-
                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
                if isinstance(data_qtype, bool):
                    if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -353,10 +318,6 @@ class Model:
                        data_qtype = gguf.GGMLQuantizationType.BF16
                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                        data_qtype = gguf.GGMLQuantizationType.Q8_0
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
-                        data_qtype = gguf.GGMLQuantizationType.TQ1_0
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
-                        data_qtype = gguf.GGMLQuantizationType.TQ2_0
                    else:
                        raise ValueError(f"Unknown file type: {self.ftype.name}")

@@ -600,9 +561,6 @@ class Model:
        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
            # ref: https://huggingface.co/databricks/dbrx-base
            res = "dbrx"
-        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
-            # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-            res = "jina-v1-en"
        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
            res = "jina-v2-en"
@@ -648,12 +606,6 @@ class Model:
        if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
            res = "exaone"
-        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
-            # ref: https://huggingface.co/microsoft/phi-2
-            res = "phi-2"
-        if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
-            # ref: https://huggingface.co/facebook/chameleon-7b
-            res = "chameleon"

        if res is None:
            logger.warning("\n")
@@ -1512,7 +1464,7 @@ class StableLMModel(Model):
                raise ValueError(f"Unprocessed norms: {norms}")


-@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

@@ -1620,7 +1572,7 @@ class LlamaModel(Model):

        return [(self.map_tensor_name(name), data_torch)]

-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+    def prepare_tensors(self):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
@@ -1647,9 +1599,9 @@ class LlamaModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

-    def prepare_tensors(self):
        super().prepare_tensors()

        if self._experts is not None:
@@ -1671,16 +1623,15 @@ class BitnetModel(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

-    def weight_quant(self, weight: Tensor) -> Tensor:
+    def weight_quant(self, weight):
        dtype = weight.dtype
        weight = weight.float()
-        scale = weight.abs().mean().clamp(min=1e-5)
-        iscale = 1 / scale
-        # TODO: multiply by the scale directly instead of inverting it twice
-        # (this is also unnecessarily doubly inverted upstream)
-        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
-        result = (weight * iscale).round().clamp(-1, 1) / iscale
-        return result.type(dtype)
+        s = 1 / weight.abs().mean().clamp(min=1e-5)
+        weight = (weight * s).round().clamp(-1, 1) / s
+        scale = weight.abs().max().unsqueeze(0)
+        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
+        weight = torch.sign(weight).type(dtype)
+        return weight.type(dtype), scale.type(torch.float32)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        new_name = self.map_tensor_name(name)
@@ -1695,9 +1646,11 @@ class BitnetModel(Model):
            gguf.MODEL_TENSOR.FFN_GATE,
        ]):
            # transform weight into 1/0/-1 (in fp32)
-            data_torch = self.weight_quant(data_torch)
-
-        yield (new_name, data_torch)
+            weight_torch, scale_torch = self.weight_quant(data_torch)
+            yield (new_name, weight_torch)
+            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
+        else:
+            yield (new_name, data_torch)


@Model.register("GrokForCausalLM")
@@ -1866,59 +1819,6 @@ class MiniCPMModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("MiniCPM3ForCausalLM")
-class MiniCPM3Model(Model):
-    model_arch = gguf.MODEL_ARCH.MINICPM3
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
-        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
-            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if rope_scaling is not None:
-            rope_dims = self.hparams["qk_rope_head_dim"]
-
-            long_factors = rope_scaling.get('long_factor', None)
-            short_factors = rope_scaling.get('short_factor', None)
-
-            if long_factors is None or short_factors is None:
-                raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
-
-            if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
-                raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
-
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
-            yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
-
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
-        if n_kv_head is not None and n_head != n_kv_head:
-            n_head //= n_kv_head
-
-        return (
-            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape)
-        )
-
-
@Model.register("QWenLMHeadModel")
 class QwenModel(Model):
    model_arch = gguf.MODEL_ARCH.QWEN
@@ -2218,13 +2118,6 @@ class Phi3MiniModel(Model):
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))

-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
-        orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
-        rope_dims = n_embd // n_head
-
        # write rope scaling for long context (128k) model
        rope_scaling = self.find_hparam(['rope_scaling'], True)
        if rope_scaling is None:
@@ -2254,8 +2147,9 @@ class Phi3MiniModel(Model):
        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')

-        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
-        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
+        if not self.is_lora:
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG]  + ".weight", np.array(long_factors, dtype=np.float32))
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))


@Model.register("PlamoForCausalLM")
@@ -2617,7 +2511,7 @@ class NomicBertModel(BertModel):
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])


-@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
+@Model.register("XLMRobertaModel")
 class XLMRobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -2715,11 +2609,6 @@ class XLMRobertaModel(BertModel):
        self.gguf_writer.add_add_eos_token(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "roberta.", remove the prefix
-        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
-        if name.startswith("roberta."):
-            name = name[8:]
-
        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
        if name == "embeddings.position_embeddings.weight":
            if self._position_offset is not None:
@@ -2863,8 +2752,6 @@ class Rwkv6Model(Model):
        self.gguf_writer.add_tokenizer_model("rwkv")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
-        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
@@ -3033,66 +2920,6 @@ class OlmoModel(Model):
        return [(self.map_tensor_name(name), data_torch)]


-@Model.register("OlmoeForCausalLM")
-class OlmoeModel(Model):
-    model_arch = gguf.MODEL_ARCH.OLMOE
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
-        if (n_experts := self.hparams.get("num_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    # Copied from: Qwen2MoeModel
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # process the experts separately
-        if name.find("experts") != -1:
-            n_experts = self.hparams["num_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    # Copied from: Qwen2MoeModel
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
@@ -3131,14 +2958,6 @@ class JinaBertV2Model(BertModel):
        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(True)

-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-

@Model.register("OpenELMForCausalLM")
 class OpenELMModel(Model):
@@ -4079,7 +3898,7 @@ class ExaoneModel(Model):
                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])

-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+    def prepare_tensors(self):
        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
            if rope_scaling.get("rope_type", '').lower() == "llama3":
                base = self.hparams.get("rope_theta", 10000.0)
@@ -4106,112 +3925,14 @@ class ExaoneModel(Model):
                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))

-                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))

-
-@Model.register("GraniteForCausalLM")
-class GraniteModel(LlamaModel):
-    """Conversion for IBM's GraniteForCausalLM"""
-    model_arch = gguf.MODEL_ARCH.GRANITE
-
-    def set_gguf_parameters(self):
-        """Granite uses standard llama parameters with the following differences:
-
-        - No head_dim support
-        - New multiplier params:
-            - attention_scale
-            - embedding_scale
-            - residual_scale
-        - logits_scaling
-        """
-        if head_dim := self.hparams.pop("head_dim", None):
-            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
-        super().set_gguf_parameters()
-        # NOTE: Convert _multiplier params to _scale params for naming
-        #   consistency
-        if attention_scale := self.hparams.get("attention_multiplier"):
-            self.gguf_writer.add_attention_scale(attention_scale)
-            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
-        if embedding_scale := self.hparams.get("embedding_multiplier"):
-            self.gguf_writer.add_embedding_scale(embedding_scale)
-            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
-        if residual_scale := self.hparams.get("residual_multiplier"):
-            self.gguf_writer.add_residual_scale(residual_scale)
-            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
-        if logits_scale := self.hparams.get("logits_scaling"):
-            self.gguf_writer.add_logit_scale(logits_scale)
-            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
-
-
-@Model.register("GraniteMoeForCausalLM")
-class GraniteMoeModel(GraniteModel):
-    """Conversion for IBM's GraniteMoeForCausalLM"""
-    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        """In modeling_granitemoe, the JetMoe implementation of parallel experts
-        is used. This essentially merges w1 and w3 into a single tensor with 2x
-        the hidden size that is then split during forward. To keep compatibility
-        with existing mixtral support, we pull them apart here.
-        """
-
-        if name.endswith("block_sparse_moe.input_linear.weight"):
-            ffn_dim = self.hparams["intermediate_size"]
-            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
-            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
-            return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
-            ]
-
-        return super().modify_tensors(data_torch, name, bid)
-
-
-@Model.register("ChameleonForConditionalGeneration")
-@Model.register("ChameleonForCausalLM")  # obsolete
-class ChameleonModel(Model):
-    model_arch = gguf.MODEL_ARCH.CHAMELEON
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False))
-
-    def set_vocab(self):
-        self._set_vocab_gpt2()
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # ignore image tokenizer for now
-        # TODO: remove this once image support is implemented for Chameleon
-        if name.startswith("model.vqmodel"):
-            return []
-
-        n_head = self.hparams["num_attention_heads"]
-        n_kv_head = self.hparams.get("num_key_value_heads")
-        hidden_dim = self.hparams.get("hidden_size")
-
-        if name.endswith(("q_proj.weight", "q_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith(("k_proj.weight", "k_proj.bias")):
-            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
-        if name.endswith(("q_norm.weight", "q_norm.bias")):
-            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_head, hidden_dim)
-        if name.endswith(("k_norm.weight", "k_norm.bias")):
-            data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim)
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203
-    @staticmethod
-    def _reverse_hf_permute(data_torch, n_heads, hidden_dim):
-        head_dim = hidden_dim // n_heads
-        data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1)
-        data_torch = data_torch.repeat_interleave(n_heads, 0)
-        return data_torch
+        super().prepare_tensors()


 ###### CONVERSION LOGIC ######

-
 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
    _tensor_type = torch.Tensor
@@ -4290,8 +4011,8 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",
@@ -4378,8 +4099,6 @@ def main() -> None:
        "f16": gguf.LlamaFileType.MOSTLY_F16,
        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
-        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
-        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
        "auto": gguf.LlamaFileType.GUESSED,
    }

--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -31,7 +31,6 @@ import re
 import requests
 import sys
 import json
-import shutil

 from hashlib import sha256
 from enum import IntEnum, auto
@@ -81,7 +80,6 @@ models = [
    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-v1-en",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
@@ -99,8 +97,6 @@ models = [
    {'name': "bloom",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
    {'name': "gpt3-finnish",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
    {"name": "exaone",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
-    {"name": "phi-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
-    {"name": "chameleon",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
 ]


@@ -129,27 +125,12 @@ def download_model(model):
    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

-    if os.path.isdir(repo):
-        # If repo is a path on the file system, copy the directory
-        for file in files:
-            src_path = os.path.join(repo, file)
-            dst_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(dst_path):
-                logger.info(f"{name}: File {dst_path} already exists - skipping")
-                continue
-            if os.path.isfile(src_path):
-                shutil.copy2(src_path, dst_path)
-                logger.info(f"{name}: Copied {src_path} to {dst_path}")
-            else:
-                logger.warning(f"{name}: Source file {src_path} does not exist")
-    else:
-        # If repo is a URL, download the files
-        for file in files:
-            save_path = f"models/tokenizers/{name}/{file}"
-            if os.path.isfile(save_path):
-                logger.info(f"{name}: File {save_path} already exists - skipping")
-                continue
-            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)


 for model in models:
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -331,10 +331,6 @@ if __name__ == '__main__':
                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                super().set_gguf_parameters()

-            def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-                # Never add extra tensors (e.g. rope_freqs) for LoRA adapters
-                return ()
-
            def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
                tensor_map: dict[str, PartialLoraTensor] = {}

@@ -367,13 +363,7 @@ if __name__ == '__main__':
                    yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))

            def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = list(super().modify_tensors(data_torch, name, bid))
-                # some archs may have the same tensor for lm_head and output (tie word embeddings)
-                # in this case, adapters targeting lm_head will fail when using llama-export-lora
-                # therefore, we ignore them for now
-                # see: https://github.com/ggerganov/llama.cpp/issues/9065
-                if name == "lm_head.weight" and len(dest) == 0:
-                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
+                dest = super().modify_tensors(data_torch, name, bid)
                for dest_name, dest_data in dest:
                    assert isinstance(dest_data, LoraTorchTensor)
                    lora_a, lora_b = dest_data.get_lora_A_B()
@@ -396,6 +386,7 @@ if __name__ == '__main__':
            dry_run=args.dry_run,
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
+            is_lora=True,
        )

        logger.info("Exporting model...")
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -26,7 +26,7 @@

 ### Llama.cpp + SYCL

-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
+The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it can also support GPUs from other vendors: Nvidia GPUs (*AMD GPU support is coming*).

 ## Recommended Release

@@ -111,18 +111,10 @@ SYCL backend supports Intel GPU Family:

 **Verified devices**

-| Nvidia GPU               | Status    | Verified Model |
-|--------------------------|-----------|----------------|
-| Ampere Series            | Supported | A100, A4000    |
-| Ampere Series *(Mobile)* | Supported | RTX 40 Series  |
-
-| AMD GPU                  | Status       | Verified Model |
-|--------------------------|--------------|----------------|
-| Radeon Pro               | Experimental | W6800          |
-| Radeon RX                | Experimental | 6700 XT        |
-
-Note: AMD GPU support is highly experimental and is incompatible with F16.
-Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
+| Nvidia GPU               | Status  | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series            | Support | A100, A4000    |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series  |

 ## Docker
 The docker build option is currently limited to *intel GPU* targets.
@@ -194,10 +186,6 @@ Platform #0: Intel(R) OpenCL HD Graphics

 In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.

- **AMD GPU**
-
-To target AMD GPUs with SYCL, the ROCm stack must be installed first.
-
 2. **Install Intel® oneAPI Base toolkit**

 - **For Intel GPU**
@@ -224,19 +212,6 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
 cmake --build buildWithCublas --config Release
 ```

- **Adding support to AMD GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
-
-**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-# Find your HIPTARGET with rocminfo, under the key 'Name:'
-cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
-cmake --build buildWithrocBLAS --config Release
-```

 3. **Verify installation and environment**

@@ -248,32 +223,22 @@ sycl-ls

 - **Intel GPU**

-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
+When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:

 ```
-[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 ```

 - **Nvidia GPU**

-Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
-
+Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as below:
 ```
-[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
-```
-
- **AMD GPU**
-
-For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
-
-```
-[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
-[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
+[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
+[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
 ```

 ### II. Build llama.cpp
@@ -301,7 +266,6 @@ cmake --build build --config Release -j -v
 ```

 #### Nvidia GPU
-
 ```sh
 # Export relevant ENV variables
 export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
@@ -319,25 +283,7 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -

 # build all binary
 cmake --build build --config Release -j -v
-```

-#### AMD GPU
-
-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
-
-# Build LLAMA with rocBLAS acceleration through SYCL
-
-## AMD
-# Use FP32, FP16 is not supported
-# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:'
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# build all binary
-cmake --build build --config Release -j -v
 ```

 ### III. Run the inference
@@ -640,11 +586,11 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 #### Build

-| Name               | Value                                 | Function                                    |
-|--------------------|---------------------------------------|---------------------------------------------|
-| GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
-| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
-| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path.      |
+| Name               | Value                             | Function                                    |
+|--------------------|-----------------------------------|---------------------------------------------|
+| GGML_SYCL          | ON (mandatory)                    | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized model|
+| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

@@ -690,14 +636,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512

  It's same for other projects including llama.cpp SYCL backend.

- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
-
-  Device Memory is not enough.
-
-  |Reason|Solution|
-  |-|-|
-  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
-  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|

 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
--- a/docs/build.md
+++ b/docs/build.md
@@ -380,9 +380,3 @@ For detailed info, such as model/device supports, CANN install, please refer to
 ### Android

 To read documentation for how to build on Android, [click here](./android.md)
-
-### Arm CPU optimized mulmat kernels
-
-Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
-
-To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -20,7 +20,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).

 ## Usage

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -16,6 +16,7 @@ else()
    add_subdirectory(baby-llama)
    add_subdirectory(batched-bench)
    add_subdirectory(batched)
+    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -49,12 +49,3 @@ There are 2 modes of operation:
 |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
 |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
 |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
-
-### JSONL output
-
-Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
-
-```json lines
-{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
-{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
-```
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,28 +1,49 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <algorithm>
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG("\n");
+// parses a comma-separated list of integers with std::atoi; mutates the input string (each ',' is overwritten with '\0')
+static std::vector<int> parse_list(char * p) {
+    std::vector<int> ret;
+
+    char * q = p; // start of the item currently being scanned
+
+    while (*p) {
+        if (*p == ',') {
+            *p = '\0'; // terminate the current item in place so std::atoi stops here
+            ret.push_back(std::atoi(q));
+            q = p + 1; // next item begins right after the separator
+        }
+
+        ++p;
+    }
+
+    ret.push_back(std::atoi(q)); // last item (also reached for an empty string or after a trailing ',')
+
+    return ret;
+}
+
+static void print_usage(int argc, char ** argv, const gpt_params & params) { // prints the common usage text followed by a batched-bench example
+    gpt_params_print_usage(argc, argv, params); // shared option help provided by the common library
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); // argv[0] is substituted as the program name
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    int is_pp_shared = params.is_pp_shared;

    std::vector<int> n_pp = params.n_pp;
@@ -79,7 +100,7 @@ int main(int argc, char ** argv) {

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
-                LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }

@@ -96,18 +117,17 @@ int main(int argc, char ** argv) {
        }

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

-    if (!params.batched_bench_output_jsonl) {
-        LOG("\n");
-        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG("\n");
-        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
-    }
+    LOG_TEE("\n");
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("\n");
+
+    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
+    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");

    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -136,7 +156,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_clear(ctx);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_ERR("%s: llama_decode() failed\n", __func__);
+                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return 1;
                }

@@ -158,7 +178,7 @@ int main(int argc, char ** argv) {
                    }

                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_ERR("%s: llama_decode() failed\n", __func__);
+                        LOG_TEE("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }
@@ -175,22 +195,12 @@ int main(int argc, char ** argv) {
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;

-                if(params.batched_bench_output_jsonl) {
-                    LOG(
-                        "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
-                        "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
-                        n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
-                        pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
-                    );
-                } else {
-                    LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
-                }
+                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
            }
        }
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_batch_free(batch);

@@ -199,7 +209,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -50,22 +50,20 @@ defer {
    llama_free(context)
 }

-var sparams = llama_sampler_chain_default_params()
+var sparams = llama_sampling_params()
+sparams.top_k = 40
+sparams.top_p = 0.9
+sparams.temp  = 0.4

-let smpl = llama_sampler_chain_init(sparams)
+let smpl = llama_sampling_init(model, sparams)
 guard smpl != nil else {
    print("Failed to initialize sampling")
    exit(1)
 }
 defer {
-    llama_sampler_free(smpl)
+    llama_sampling_free(smpl)
 }

-llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
-llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
-llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
-llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
-
 let n_ctx = llama_n_ctx(context)

 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -138,7 +136,17 @@ while n_cur <= n_len {
            continue
        }

-        let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
+        var logits = llama_get_logits_ith(context, i_batch[i])
+
+        llama_sampling_set_logits(smpl, logits)
+
+        llama_sampling_top_k(smpl, nil)
+        llama_sampling_top_p(smpl, nil)
+        llama_sampling_temp (smpl, nil)
+
+        let new_token_id = llama_sampling_sample_dist(smpl, nil)
+
+        // const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nil);

        // is it an end of stream? -> mark the stream as finished
        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@@ -198,10 +206,9 @@ if n_parallel > 1 {

 let t_main_end = ggml_time_us()

-print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")

-llama_perf_sampler_print(smpl)
-llama_perf_context_print(context)
+llama_print_timings(context, smpl)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let utf8Count = text.utf8.count
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,6 +1,4 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <algorithm>
@@ -8,10 +6,12 @@
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG("\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) { // prints the common usage text followed by a batched example
+    gpt_params_print_usage(argc, argv, params); // shared option help provided by the common library
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); // argv[0] is substituted as the program name
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -20,11 +20,11 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();

    // number of parallel batches
    int n_parallel = params.n_parallel;
@@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n" , __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

@@ -64,39 +64,41 @@ int main(int argc, char ** argv) {

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    auto sparams = llama_sampler_chain_default_params();
+    auto sparams = llama_sampling_default_params();

-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    sparams.seed  = params.sparams.seed;
+    sparams.top_k = 40;
+    sparams.top_p = 0.9f;
+    sparams.temp  = 0.4f;

-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+    llama_sampling * smpl = llama_sampling_init(model, sparams);

    if (ctx == NULL) {
-        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    const int n_ctx = llama_n_ctx(ctx);

-    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_ERR("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
+        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
        return 1;
    }

    // print the prompt token-by-token

-    LOG("\n");
+    fprintf(stderr, "\n");

    for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

+    fflush(stderr);
+
    // create a llama_batch
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -114,7 +116,7 @@ int main(int argc, char ** argv) {

    if (llama_model_has_encoder(model)) {
        if (llama_encode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return 1;
        }

@@ -131,7 +133,7 @@ int main(int argc, char ** argv) {
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0) {
-        LOG_ERR("%s: llama_decode() failed\n", __func__);
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
        return 1;
    }

@@ -142,7 +144,7 @@ int main(int argc, char ** argv) {
    //}

    if (n_parallel > 1) {
-        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
    }

    // main loop
@@ -170,14 +172,24 @@ int main(int argc, char ** argv) {
                continue;
            }

-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
+            const auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
+
+            llama_sampling_set_logits(smpl, logits);
+
+            llama_sampling_top_k(smpl, nullptr);
+            llama_sampling_top_p(smpl, nullptr);
+            llama_sampling_temp (smpl, nullptr);
+
+            const llama_token new_token_id = llama_sampling_sample_dist(smpl, nullptr);
+
+            //const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);

            // is it an end of generation? -> mark the stream as finished
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                i_batch[i] = -1;
-                LOG("\n");
+                LOG_TEE("\n");
                if (n_parallel > 1) {
-                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                }

                continue;
@@ -185,7 +197,8 @@ int main(int argc, char ** argv) {

            // if there is only one stream, we print immediately to stdout
            if (n_parallel == 1) {
-                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                fflush(stdout);
            }

            streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -207,33 +220,33 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

+    LOG_TEE("\n");
+
    if (n_parallel > 1) {
-        LOG("\n");
+        LOG_TEE("\n");

        for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
        }
    }

    const auto t_main_end = ggml_time_us();

-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, smpl);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

-    llama_sampler_free(smpl);
+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(TARGET llama-bench-matmult) # name of the executable built by this directory
+add_executable(${TARGET} benchmark-matmult.cpp) # single-source executable
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) # links llama, build_info and the thread libraries
+target_include_directories(${TARGET} PRIVATE ../../common) # adds the repository's common/ directory to the include path
+target_compile_features(${TARGET} PRIVATE cxx_std_11) # requires at least C++11
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -0,0 +1,275 @@
+#include "common.h"
+#include "ggml.h"
+
+#include <locale.h>
+#include <assert.h>
+#include <math.h>
+#include <cstring>
+#include <cstdio>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { // plans and computes `graph` with `n_threads`, using `buf` as the work buffer
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
+
+    if (plan.work_size > 0) { // a work buffer is only provided when the plan asks for one
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data(); // buf must stay alive until ggml_graph_compute returns
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+static float tensor_sum_elements(const ggml_tensor * tensor) { // sums the ne[0] x ne[1] elements of an F32 tensor; returns 0 for any other type
+    double sum = 0; // accumulated in double, narrowed to float on return
+    if (tensor->type == GGML_TYPE_F32) {
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            for (int k = 0; k < tensor->ne[0]; k++) {
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k]; // indexes data as a dense array with ne[0] floats per row
+            }
+        }
+    }
+    return sum;
+}
+
+static void tensor_dump(const ggml_tensor * tensor, const char * name) { // prints the tensor's type, first three ne/nb entries and element sum to stdout
+    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
+        tensor->type, ggml_type_name(tensor->type),
+        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor); // 0 unless the tensor is GGML_TYPE_F32
+    printf("Sum of tensor %s is %6.2f\n", name, sum);
+}
+
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
+
+struct benchmark_params_struct { // command-line settings of the matmul benchmark
+    int     n_threads     = 1; // set with -t / --threads
+    int32_t n_iterations  = 10; // set with -i / --iter
+};
+
+static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { // writes the option help to stderr; `params` supplies the default values shown
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
+    fprintf(stderr, "\n");
+}
+
+int main(int argc, char ** argv)  {
+    struct benchmark_params_struct benchmark_params;
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-i" || arg == "--iter") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_iterations = std::stoi(argv[i]);
+        }  else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, benchmark_params);
+            exit(0);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
+    }
+
+    print_build_info();
+    printf("Starting Test\n");
+
+    // create the ggml context
+    struct ggml_context * ctx;
+    //const int sizex = 4096;
+    //const int sizey = 11008;
+
+#undef VERBOSE_DEBUGGING
+#ifndef VERBOSE_DEBUGGING
+    const int sizey = 4096;
+    const int sizex = 11008;
+    const int sizez = 128;
+#else
+    /* Working - let's increase size */
+    const int sizey = 1;
+    const int sizex = (8*32);
+    const int sizez = 1;
+
+    /*const int sizey = 1;
+    const int sizex = 3*(8*32);
+    const int sizez = 1;*/
+#endif
+
+    //printf("Memsize required = %i\n", sizex*sizex);
+
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
+    size_t ctx_size = 0;
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += 1024*1024*16;
+
+    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /* no_alloc   =*/ 0
+    };
+
+    ctx = ggml_init(params);
+    if (!ctx) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        return 1;
+    }
+
+
+    printf("Creating new tensors\n");
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m11, 1.0f);
+
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m12, 1.5f);
+
+    // printf("Creating new tensor m2\n");
+    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
+    ggml_set_f32(m2, 2.0f);
+
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
+    // printf("Creating new tensor m11xm2\n");
+    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
+
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, m11xm2);
+
+    printf("n_threads=%i\n", benchmark_params.n_threads);
+
+    TENSOR_DUMP(m11);
+    TENSOR_DUMP(m2);
+
+    std::vector<uint8_t> work_buffer;
+
+    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+
+    TENSOR_DUMP(gf->nodes[0]);
+
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
+
+    int32_t nelements = sizex*sizey;
+
+    // Set up the benchmark matrices
+    // printf("Creating new tensor q11 & Running quantize\n");
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
+
+    // Set up the compute graph
+    // printf("Creating new tensor q31\n");
+    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
+
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf31, q31);
+
+    // Set up a second graph computation to make sure we overwrite the CPU cache lines
+    // printf("Creating new tensor q12 & Running quantize\n");
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
+
+    // printf("Creating new tensor q32\n");
+    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
+
+    //printf("Creating compute graph\n");
+    struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf32, q32);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
+
+    const int dimx = sizex;
+    const int dimy = sizey;
+    const int dimz = sizez;
+    long long int flops_per_dot_product = dimy + dimy;
+    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+
+
+    // Let's use the F32 result from above as a reference for the quantized multiplication
+    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");
+
+    double  gflops_sum = 0;
+    for (int i=0;i<benchmark_params.n_iterations ;i++) {
+
+        long long int start = ggml_time_us();
+        //printf("Running ggml_graph_compute\n");
+        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+
+        long long int stop = ggml_time_us();
+        long long int usec = stop-start;
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        gflops_sum += gflops;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
+            i,
+            benchmark_params.n_threads,
+            sizex, sizey, sizez, flops_per_matrix,
+            usec,gflops);
+
+#ifdef VERBOSE_DEBUGGING
+        TENSOR_DUMP("res",gf31.nodes[0])
+#endif
+
+        // Check that the matrix multiplication result is in the right ballpark
+        // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
+        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
+        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
+
+        if (delta > allowed_delta)  {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
+                sum_of_F32_reference,
+                sum_of_Q4_result,
+                delta,
+                allowed_delta
+            );
+            exit(0);
+        }
+
+        // Running a different graph computation to make sure we overwrite the CPU cache lines
+        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+    }
+    printf("\n");
+    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
+    printf("=====================================================================================\n");
+}
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -9,7 +9,6 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
-#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -106,43 +105,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
    try {
        w->token_embedding_table.resize(p->vocab_size * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

        w->rms_att_weight.resize(p->n_layers * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

        w->rms_ffn_weight.resize(p->n_layers * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

        w->wq.resize(p->n_layers * p->dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

        w->wo.resize(p->n_layers * p->dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-        LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

        w->rms_final_weight.resize(p->dim);
-        LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

        if (shared_weights) {
            w->wcls = {};
        } else {
            w->wcls.resize(p->vocab_size * p->dim);
-            LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
        }
    }
    catch (std::length_error &) {
@@ -174,7 +173,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
-        LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
+        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
        return 1;
    }

@@ -182,26 +181,26 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }

 static void print_sample_weights(TransformerWeights *w){
-    LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
-    LOG_INF("%f\n", w->token_embedding_table[0]);
-    LOG_INF("%f\n", w->rms_att_weight[0]);
-    LOG_INF("%f\n", w->rms_ffn_weight[0]);
+    LOG("----- Quick print of first of the weight vales of all the variables\n");
+    LOG("%f\n", w->token_embedding_table[0]);
+    LOG("%f\n", w->rms_att_weight[0]);
+    LOG("%f\n", w->rms_ffn_weight[0]);

-    LOG_INF("%f\n", w->wq[0]);
-    LOG_INF("%f\n", w->wk[0]);
-    LOG_INF("%f\n", w->wv[0]);
-    LOG_INF("%f\n", w->wo[0]);
-    LOG_INF("%f\n", w->w1[0]);
-    LOG_INF("%f\n", w->w2[0]);
-    LOG_INF("%f\n", w->w3[0]);
-    LOG_INF("%f\n", w->rms_att_weight[0]);
-    if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
+    LOG("%f\n", w->wq[0]);
+    LOG("%f\n", w->wk[0]);
+    LOG("%f\n", w->wv[0]);
+    LOG("%f\n", w->wo[0]);
+    LOG("%f\n", w->w1[0]);
+    LOG("%f\n", w->w2[0]);
+    LOG("%f\n", w->w3[0]);
+    LOG("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

 //////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.

-struct my_llama_vocab {
+struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
    using ttype = llama_token_type;
@@ -319,20 +318,20 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    LOG_INF("%s: n_vocab:   %u\n", __func__, params->n_vocab);
-    LOG_INF("%s: n_ctx:     %u\n", __func__, params->n_ctx);
-    LOG_INF("%s: n_embd:    %u\n", __func__, params->n_embd);
-    LOG_INF("%s: n_mult:    %u\n", __func__, params->n_mult);
-    LOG_INF("%s: n_head:    %u\n", __func__, params->n_head);
-    LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-    LOG_INF("%s: n_ff:      %u\n", __func__, params->n_ff);
-    LOG_INF("%s: n_layer:   %u\n", __func__, params->n_layer);
-    LOG_INF("%s: n_rot:     %u\n", __func__, params->n_rot);
+    LOG("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    LOG("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    LOG("%s: n_embd:    %u\n", __func__, params->n_embd);
+    LOG("%s: n_mult:    %u\n", __func__, params->n_mult);
+    LOG("%s: n_head:    %u\n", __func__, params->n_head);
+    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG("%s: n_ff:      %u\n", __func__, params->n_ff);
+    LOG("%s: n_layer:   %u\n", __func__, params->n_layer);
+    LOG("%s: n_rot:     %u\n", __func__, params->n_rot);
 }

 static void print_tensor_info(const struct ggml_context * ctx) {
    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        LOG_INF("%s: Allocating ", __func__);
+        LOG("%s: Allocating ", __func__);
        int64_t total = 1;
        int i = 0;
        for (; i < ggml_n_dims(t); ++i) {
@@ -525,9 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
    return out.str();
 }

-static void load_vocab(const char * filename, const Config * config, struct my_llama_vocab * vocab) {
+static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
-        LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
@@ -575,7 +574,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
-        LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
@@ -583,13 +582,13 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
        const int  n_vocab = config->vocab_size;
        /* uint32_t max_token_length =  */ file.read_u32(); // unused
        vocab->id_to_token.resize(n_vocab);
-        for (my_llama_vocab::id id=0; id<n_vocab; ++id) {
+        for (llama_vocab::id id=0; id<n_vocab; ++id) {
            float_t score = file.read_f32();
            uint32_t len = file.read_u32();
            std::string text = file.read_string(len);

            unsigned char byte_val;
-            my_llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
            if (id == UNKNOWN_TOKEN_ID) {
                text = "<unk>";
                type = LLAMA_TOKEN_TYPE_UNKNOWN;
@@ -631,7 +630,7 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
 }

 static void save_as_llama_model(
-    struct my_llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
 ) {
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
@@ -671,7 +670,7 @@ static void save_as_llama_model(
    std::vector<const char*> tokens;
    std::vector<float> scores;
    std::vector<llama_token_type> token_types;
-    for (const my_llama_vocab::token_data & token_data : vocab->id_to_token) {
+    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
        tokens.push_back(token_data.text.c_str());
        scores.push_back(token_data.score);
        token_types.push_back(token_data.type);
@@ -872,25 +871,23 @@ static std::string basename(const std::string &path) {
 }

 int main(int argc, char ** argv) {
-    gpt_init();
-
    struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }
-
+    log_set_target(stdout);
    Config config;
    TransformerWeights weights = {};
    {
-        LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
        FILE * file = fopen(params.fn_llama2c_model, "rb");
        if (!file) {
-            LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
            return 1;
        }
        // read in the config header
        if (fread(&config, sizeof(Config), 1, file) != 1) {
-            LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
            return 1;
        }
        auto shared_weights = config.vocab_size > 0;
@@ -899,13 +896,13 @@ int main(int argc, char ** argv) {
        // read in the Transformer weights
        alloc_weights(&weights, &config, shared_weights);
        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-            LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+            LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
            return 1;
        }
        fclose(file);
    }

-    struct my_llama_vocab vocab;
+    struct llama_vocab vocab;
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
@@ -932,7 +929,7 @@ int main(int argc, char ** argv) {
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

-    LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -13,15 +12,14 @@
 #include "ggml-metal.h"
 #endif

-#include <algorithm>
-#include <climits>
 #include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <climits>


 //////////////////////////////////////////////////
@@ -37,7 +35,9 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    return ret;
 }

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -390,7 +390,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -12,9 +12,12 @@

 #include <cstdio>
 #include <ctime>
-#include <random>
 #include <string>
+#include <tuple>
 #include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>

 #define DEBUG_POS 5

@@ -204,6 +207,13 @@ static ggml_status compute_piter(
        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
    }

+// TODO: enable GPU support when support for GGML_OP_SQRT is added
+//#ifdef GGML_USE_METAL
+//    if (ggml_backend_is_metal(model.backend)) {
+//        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
+//    }
+//#endif
+
    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
    if (res == GGML_STATUS_SUCCESS) {
        auto extract_i = [](std::string prefix, std::string str) -> int {
@@ -219,8 +229,8 @@ static ggml_status compute_piter(
        result.eigenvectors.resize(params.n_batch);
        result.distances.resize(params.n_batch);
        // get output nodes
-        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
-            auto node = ggml_graph_node(gf, i);
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
            int iter = -1;
            // find b_tensor (without copying data from device)
            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,6 +1,4 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <ctime>
@@ -40,16 +38,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    llama_kv_cache_clear(ctx);

    // run model
-    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
        // encoder-only model
        if (llama_encode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to encode\n", __func__);
+            fprintf(stderr, "%s : failed to encode\n", __func__);
        }
    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        // decoder-only model
        if (llama_decode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to decode\n", __func__);
+            fprintf(stderr, "%s : failed to decode\n", __func__);
        }
    }

@@ -81,16 +79,19 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    params.embedding = true;
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;

+    print_build_info();
+
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -100,7 +101,7 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

@@ -110,19 +111,19 @@ int main(int argc, char ** argv) {
    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
+        fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
        return 1;
    }

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    // split the prompt into lines
@@ -135,9 +136,9 @@ int main(int argc, char ** argv) {
    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
    for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
        if (inp.size() > n_batch) {
-            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
            return 1;
        }
@@ -148,20 +149,20 @@ int main(int argc, char ** argv) {
    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
-            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s:          'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }
    }

    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) inputs.size(); i++) {
-            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
            for (int j = 0; j < (int) inputs[i].size(); j++) {
-                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
            }
-            LOG("\n\n");
+            fprintf(stderr, "\n\n");
        }
    }

@@ -212,62 +213,57 @@ int main(int argc, char ** argv) {
    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

    if (params.embd_out.empty()) {
-        LOG("\n");
+        fprintf(stdout, "\n");

        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
            for (int j = 0; j < n_embd_count; j++) {
-                LOG("embedding %d: ", j);
+                fprintf(stdout, "embedding %d: ", j);
                for (int i = 0; i < std::min(3, n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                LOG(" ... ");
+                fprintf(stdout, " ... ");
                for (int i = n_embd - 3; i < n_embd; i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                LOG("\n");
-            }
-        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
-            for (int j = 0; j < n_embd_count; j++) {
-                // NOTE: if you change this log - update the tests in ci/run.sh
-                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+                fprintf(stdout, "\n");
            }
        } else {
            // print the first part of the embeddings or for a single prompt, the full embedding
            for (int j = 0; j < n_prompts; j++) {
-                LOG("embedding %d: ", j);
+                fprintf(stdout, "embedding %d: ", j);
                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
                    if (params.embd_normalize == 0) {
-                        LOG("%6.0f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
                    } else {
-                        LOG("%9.6f ", emb[j * n_embd + i]);
+                        fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
                    }
                }
-                LOG("\n");
+                fprintf(stdout, "\n");
            }

            // print cosine similarity matrix
            if (n_prompts > 1) {
-                LOG("\n");
-                LOG("cosine similarity matrix:\n\n");
+                fprintf(stdout, "\n");
+                printf("cosine similarity matrix:\n\n");
                for (int i = 0; i < n_prompts; i++) {
-                    LOG("%6.6s ", prompts[i].c_str());
+                    fprintf(stdout, "%6.6s ", prompts[i].c_str());
                }
-                LOG("\n");
+                fprintf(stdout, "\n");
                for (int i = 0; i < n_prompts; i++) {
                    for (int j = 0; j < n_prompts; j++) {
                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                        LOG("%6.2f ", sim);
+                        fprintf(stdout, "%6.2f ", sim);
                    }
-                    LOG("%1.10s", prompts[i].c_str());
-                    LOG("\n");
+                    fprintf(stdout, "%1.10s", prompts[i].c_str());
+                    fprintf(stdout, "\n");
                }
            }
        }
@@ -276,45 +272,43 @@ int main(int argc, char ** argv) {
    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
        const bool notArray = params.embd_out != "array";

-        LOG(notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
+        fprintf(stdout, notArray ? "{\n  \"object\": \"list\",\n  \"data\": [\n" : "[");
        for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) LOG("    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
-            LOG("[");
+            if (notArray) fprintf(stdout, "    {\n      \"object\": \"embedding\",\n      \"index\": %d,\n      \"embedding\": ",j);
+            fprintf(stdout, "[");
            for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                i++;
-                if (i < n_embd) LOG(","); else break;
+                if (i < n_embd) fprintf(stdout, ","); else break;
            }
-            LOG(notArray ? "]\n    }" : "]");
+            fprintf(stdout, notArray ? "]\n    }" : "]");
            j++;
-            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
        }
-        LOG(notArray ? "\n  ]" : "]\n");
+        fprintf(stdout, notArray ? "\n  ]" : "]\n");

        if (params.embd_out == "json+" && n_prompts > 1) {
-            LOG(",\n  \"cosineSimilarity\": [\n");
+            fprintf(stdout, ",\n  \"cosineSimilarity\": [\n");
            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
-                LOG("    [");
+                fprintf(stdout, "    [");
                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    LOG("%6.2f", sim);
+                    fprintf(stdout, "%6.2f", sim);
                    j++;
-                    if (j < n_embd_count) LOG(", "); else break;
+                    if (j < n_embd_count) fprintf(stdout, ", "); else break;
                }
-                LOG(" ]");
+                fprintf(stdout, " ]");
                i++;
-                if (i < n_embd_count) LOG(",\n"); else break;
+                if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
            }
-            LOG("\n  ]");
+            fprintf(stdout, "\n  ]");
        }

-        if (notArray) LOG("\n}\n");
+        if (notArray) fprintf(stdout, "\n}\n");
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
-
    // clean up
+    llama_print_timings(ctx, nullptr);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -1,11 +1,11 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"
 #include "ggml.h"

 #include <cstdio>
+#include <random>
 #include <string>
+#include <tuple>
 #include <vector>

 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        LOG("                                     [\n");
+        printf("                                     [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
-                LOG("                                      ..., \n");
+                printf("                                      ..., \n");
                i2 = ne[2] - n;
            }
-            LOG("                                      [\n");
+            printf("                                      [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
-                    LOG("                                       ..., \n");
+                    printf("                                       ..., \n");
                    i1 = ne[1] - n;
                }
-                LOG("                                       [");
+                printf("                                       [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
-                        LOG("..., ");
+                        printf("..., ");
                        i0 = ne[0] - n;
                    }
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    } else {
                        GGML_ABORT("fatal error");
                    }
-                    LOG("%12.4f", v);
+                    printf("%12.4f", v);
                    sum += v;
-                    if (i0 < ne[0] - 1) LOG(", ");
+                    if (i0 < ne[0] - 1) printf(", ");
                }
-                LOG("],\n");
+                printf("],\n");
            }
-            LOG("                                      ],\n");
+            printf("                                      ],\n");
        }
-        LOG("                                     ]\n");
-        LOG("                                     sum = %f\n", sum);
+        printf("                                     ]\n");
+        printf("                                     sum = %f\n", sum);
    }
 }

@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

-    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-         t->name, ggml_type_name(t->type), ggml_op_desc(t),
-         src0->name, ggml_ne_string(src0).c_str(),
-         src1 ? src1_str : "",
-         ggml_ne_string(t).c_str());
+    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+           t->name, ggml_type_name(t->type), ggml_op_desc(t),
+           src0->name, ggml_ne_string(src0).c_str(),
+           src1 ? src1_str : "",
+           ggml_ne_string(t).c_str());


    // copy the data from the GPU memory if needed
@@ -132,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        LOG_ERR("%s : failed to eval\n", __func__);
+        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

@@ -144,11 +144,12 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
+    print_build_info();

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -165,15 +166,14 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
-        LOG_ERR("%s : failed to init\n", __func__);
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
-        LOG_INF("\n");
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    bool OK = run(ctx, params);
@@ -181,8 +181,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -370,7 +369,7 @@ struct lora_merge_ctx {

        // write data to output file
        {
-            auto * result = ggml_graph_node(gf, -1);
+            auto result = gf->nodes[gf->n_nodes - 1];
            size_t len = ggml_nbytes(result);
            if (read_buf.size() < len) {
                read_buf.resize(len);
@@ -392,7 +391,9 @@ struct lora_merge_ctx {
    }
 };

-static void print_usage(int, char ** argv) {
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
    printf("\nexample usage:\n");
    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
    printf("\nNOTE: output model is F16\n");
@@ -402,11 +403,12 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    g_verbose = (params.verbosity > 1);
+    g_verbose = (params.verbosity == 1);
    try {
        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
        ctx.run_merge();
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -12,24 +12,24 @@ static bool llama_grammar_validate(struct llama_grammar * grammar, const std::st
    const auto cpts = unicode_cpts_from_utf8(input_str);

    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
+          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);

    size_t pos = 0;
    for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
+        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy

-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
+        cur_stacks = llama_grammar_accept(rules, prev_stacks, cpt);

-        if (stacks_cur.empty()) {
+        if (cur_stacks.empty()) {
            error_pos = pos;
            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
-            stacks_cur = stacks_prev;
+            cur_stacks = prev_stacks;
            return false;
        }
        ++pos;
    }

-    for (const auto & stack : stacks_cur) {
+    for (const auto & stack : cur_stacks) {
        if (stack.empty()) {
            return true;
        }
--- a/examples/gen-docs/CMakeLists.txt
+++ b/examples/gen-docs/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-gen-docs)
-add_executable(${TARGET} gen-docs.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -1,83 +0,0 @@
-#include "arg.h"
-#include "common.h"
-
-#include <fstream>
-#include <string>
-
-// Export usage message (-h) to markdown format
-
-static void write_table_header(std::ofstream & file) {
-    file << "| Argument | Explanation |\n";
-    file << "| -------- | ----------- |\n";
-}
-
-static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
-    file << "| `";
-    // args
-    for (const auto & arg : opt.args) {
-    if (arg == opt.args.front()) {
-            file << arg;
-            if (opt.args.size() > 1) file << ", ";
-        } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
-        }
-    }
-    // value hint
-    if (opt.value_hint) {
-        std::string md_value_hint(opt.value_hint);
-        string_replace_all(md_value_hint, "|", "\\|");
-        file << " " << md_value_hint;
-    }
-    if (opt.value_hint_2) {
-        std::string md_value_hint_2(opt.value_hint_2);
-        string_replace_all(md_value_hint_2, "|", "\\|");
-        file << " " << md_value_hint_2;
-    }
-    // help text
-    std::string md_help(opt.help);
-    string_replace_all(md_help, "\n", "<br/>");
-    string_replace_all(md_help, "|", "\\|");
-    file << "` | " << md_help << " |\n";
-}
-
-static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
-    write_table_header(file);
-    for (const auto & opt : opts) {
-        write_table_entry(file, *opt);
-    }
-}
-
-static void export_md(std::string fname, llama_example ex) {
-    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
-
-    gpt_params params;
-    auto ctx_arg = gpt_params_parser_init(params, ex);
-
-    std::vector<llama_arg *> common_options;
-    std::vector<llama_arg *> sparam_options;
-    std::vector<llama_arg *> specific_options;
-    for (auto & opt : ctx_arg.options) {
-        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
-        if (opt.is_sparam) {
-            sparam_options.push_back(&opt);
-        } else if (opt.in_example(ctx_arg.ex)) {
-            specific_options.push_back(&opt);
-        } else {
-            common_options.push_back(&opt);
-        }
-    }
-
-    file << "**Common params**\n\n";
-    write_table(file, common_options);
-    file << "\n\n**Sampling params**\n\n";
-    write_table(file, sparam_options);
-    file << "\n\n**Example-specific params**\n\n";
-    write_table(file, specific_options);
-}
-
-int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
-
-    return 0;
-}
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -22,20 +22,12 @@
 #endif

 enum split_operation : uint8_t {
-    OP_NONE,
-    OP_SPLIT,
-    OP_MERGE,
-};
-
-enum split_mode : uint8_t {
-    MODE_NONE,
-    MODE_TENSOR,
-    MODE_SIZE,
+    SPLIT_OP_SPLIT,
+    SPLIT_OP_MERGE,
 };

 struct split_params {
-    split_operation operation = OP_NONE;
-    split_mode mode = MODE_NONE;
+    split_operation operation = SPLIT_OP_SPLIT;
    size_t n_bytes_split = 0;
    int n_split_tensors = 128;
    std::string input;
@@ -95,52 +87,59 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
        }

        bool arg_found = false;
+        bool is_op_set = false;
+        bool is_mode_set = false;
        if (arg == "-h" || arg == "--help") {
            split_print_usage(argv[0]);
            exit(0);
-        } else if (arg == "--version") {
+        }
+        if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
-        } else if (arg == "--dry-run") {
+        }
+        if (arg == "--dry-run") {
            arg_found = true;
            params.dry_run = true;
-        } else if (arg == "--no-tensor-first-split") {
+        }
+        if (arg == "--no-tensor-first-split") {
            arg_found = true;
            params.no_tensor_first_split = true;
-        } else if (arg == "--merge") {
+        }
+
+        if (is_op_set) {
+            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+        }
+        if (arg == "--merge") {
            arg_found = true;
-            if (params.operation != OP_NONE && params.operation != OP_MERGE) {
-                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
-            }
-            params.operation = OP_MERGE;
-        } else if (arg == "--split") {
+            is_op_set = true;
+            params.operation = SPLIT_OP_MERGE;
+        }
+        if (arg == "--split") {
            arg_found = true;
-            if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
-                throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
-            }
-            params.operation = OP_SPLIT;
-        } else if (arg == "--split-max-tensors") {
+            is_op_set = true;
+            params.operation = SPLIT_OP_SPLIT;
+        }
+
+        if (is_mode_set) {
+            throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+        }
+        if (arg == "--split-max-tensors") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
-            if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
-                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
-            }
-            params.mode = MODE_TENSOR;
+            is_mode_set = true;
            params.n_split_tensors = atoi(argv[arg_idx]);
-        } else if (arg == "--split-max-size") {
+        }
+        if (arg == "--split-max-size") {
            if (++arg_idx >= argc) {
                invalid_param = true;
                break;
            }
            arg_found = true;
-            if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
-                throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
-            }
-            params.mode = MODE_SIZE;
+            is_mode_set = true;
            params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
        }

@@ -149,20 +148,11 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
        }
    }

-    // the operation is split if not specified
-    if (params.operation == OP_NONE) {
-        params.operation = OP_SPLIT;
-    }
-    // the split mode is by tensor if not specified
-    if (params.mode == MODE_NONE) {
-        params.mode = MODE_TENSOR;
-    }
-
    if (invalid_param) {
        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

-    if (argc - arg_idx != 2) {
+    if (argc - arg_idx < 2) {
        throw std::invalid_argument("error: bad arguments");
    }

@@ -275,15 +265,13 @@ struct split_strategy {
    }

    bool should_split(int i_tensor, size_t next_size) {
-        if (params.mode == MODE_SIZE) {
+        if (params.n_bytes_split > 0) {
            // split by max size per file
            return next_size > params.n_bytes_split;
-        } else if (params.mode == MODE_TENSOR) {
+        } else {
            // split by number of tensors per file
            return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
        }
-        // should never happen
-        GGML_ABORT("invalid mode");
    }

    void print_info() {
@@ -401,17 +389,10 @@ static void gguf_merge(const split_params & split_params) {
    int n_split = 1;
    int total_tensors = 0;

-    // avoid overwriting existing output file
-    if (std::ifstream(split_params.output.c_str())) {
-        fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
-        exit(EXIT_FAILURE);
-    }
-
+    auto * ctx_out = gguf_init_empty();
    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
    fout.exceptions(std::ofstream::failbit); // fail fast on write errors

-    auto * ctx_out = gguf_init_empty();
-
    std::vector<uint8_t> read_data;
    std::vector<ggml_context *> ctx_metas;
    std::vector<gguf_context *> ctx_ggufs;
@@ -571,9 +552,9 @@ int main(int argc, const char ** argv) {
    split_params_parse(argc, argv, params);

    switch (params.operation) {
-        case OP_SPLIT: gguf_split(params);
+        case SPLIT_OP_SPLIT: gguf_split(params);
            break;
-        case OP_MERGE: gguf_merge(params);
+        case SPLIT_OP_MERGE: gguf_merge(params);
            break;
        default: split_print_usage(argv[0]);
            exit(EXIT_FAILURE);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -93,7 +92,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
    return result;
 }

-static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
+static std::string generate(llama_context * ctx, llama_sampling * smpl, const std::string & prompt, bool stream) {
    std::string result;

    const llama_model * model = llama_get_model(ctx);
@@ -121,8 +120,11 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

        llama_decode(ctx, bat);

-        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
+        const auto * logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);

+        llama_sampling_set_logits(smpl, logits);
+
+        llama_token token = llama_sampling_sample_greedy(smpl, nullptr);
        if (token == eos_token) {
            break;
        }
@@ -154,12 +156,11 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    llama_context_params cparams = llama_context_params_from_gpt_params(params);

@@ -170,13 +171,7 @@ int main(int argc, char * argv[]) {
    // create generation context
    llama_context * ctx = llama_new_context_with_model(model, cparams);

-    auto sparams = llama_sampler_chain_default_params();
-
-    sparams.no_perf = false;
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());

    // ### Embedding/Representation ###
    // samples taken from: https://github.com/ContextualAI/gritlm#basic
@@ -217,7 +212,7 @@ int main(int argc, char * argv[]) {
        std::string response = generate(ctx, smpl, prompt, true);
    }

-    llama_sampler_free(smpl);
+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,6 +1,4 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -19,13 +17,15 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-    LOG("\n");
+    LOG_TEE("\n");
 }

 struct Stats {
@@ -126,10 +126,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            e.counts.resize(src1->ne[0]*n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        }
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
@@ -150,8 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                        e.values[e_start + j] += x[j]*x[j];
                        e.counts[e_start + j]++;
                        if (!std::isfinite(e.values[e_start + j])) {
-                            LOG("\n");
-                            LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                            fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
@@ -174,18 +175,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            e.counts.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
-            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ABORT("fatal error");
        }
        ++e.ncall;
-        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        }
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
                e.counts[j]++;
                if (!std::isfinite(e.values[j])) {
-                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
+                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
                    exit(1);
                }
            }
@@ -237,17 +240,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        }

        if (n_zeros != 0 && is_first) {
-            LOG_INF("\n");
+            fprintf(stderr, "\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
-            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
-            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
            continue;
        }

@@ -256,7 +259,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
    }

    if (to_store.size() < m_stats.size()) {
-        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

    std::ofstream out(fname, std::ios::binary);
@@ -288,20 +291,21 @@ void IMatrixCollector::save_imatrix(int ncall) const {
        out.write(m_params.prompt_file.c_str(), len);
    }

-    LOGV(1, "\n");
-    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+    if (m_params.verbosity > 0) {
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
+    }
 }

 bool IMatrixCollector::load_imatrix(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
-        LOG_ERR("%s: failed to open %s\n",__func__, fname);
+        fprintf(stderr, "%s: failed to open %s\n",__func__, fname);
        return false;
    }
    int n_entries;
    in.read((char*)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
-        LOG_ERR("%s: no data in file %s\n", __func__, fname);
+        fprintf(stderr, "%s: no data in file %s\n", __func__, fname);
        return false;
    }
    for (int i = 0; i < n_entries; ++i) {
@@ -309,7 +313,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
-            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            fprintf(stderr, "%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
            return false;
        }
        name_as_vec[len] = 0;
@@ -320,7 +324,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
-            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
+            fprintf(stderr, "%s: failed reading number of values for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
@@ -333,7 +337,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
        std::vector<float> tmp(nval);
        in.read((char*)tmp.data(), nval*sizeof(float));
        if (in.fail()) {
-            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
+            fprintf(stderr, "%s: failed reading data for entry %d\n",__func__,i);
            m_stats = {};
            return false;
        }
@@ -434,25 +438,26 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenizing the input ..\n", __func__);
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
-        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
    }

    if (int(tokens.size()) < 2*n_ctx) {
-        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
-        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
+        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
+                n_ctx);
+        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return false;
    }

@@ -474,7 +479,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
    double nll = 0.0;
    double nll2 = 0.0;

-    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -510,7 +515,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

            // TODO: use batch.logits to save computations instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                LOG_ERR("%s : failed to eval\n", __func__);
+                fprintf(stderr, "%s : failed to eval\n", __func__);
                return false;
            }

@@ -527,29 +532,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            const int first = n_ctx/2;
-            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
            count += n_ctx - first - 1;

-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
            fflush(stdout);

            logits.clear();
        }
    }
-    LOG("\n");
+    printf("\n");

    if (params.compute_ppl) {
        nll2 /= count;
@@ -558,9 +563,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
-            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
-            LOG("Unexpected negative standard deviation of log(prob)\n");
+            printf("Unexpected negative standard deviation of log(prob)\n");
        }
    }

@@ -572,28 +577,27 @@ int main(int argc, char ** argv) {

    params.n_ctx = 512;
    params.logits_all = true;
-    params.escape = false;
+    params.verbosity = 1;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
-        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
-            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
+            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.in_files.size() > 1) {
-        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        g_collector.save_imatrix();
    }

@@ -612,20 +616,20 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
-        LOG_ERR("%s : failed to init\n", __func__);
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    if (!compute_imatrix(ctx, params)) {
@@ -634,8 +638,7 @@ int main(int argc, char ** argv) {

    g_collector.save_imatrix();

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_free(ctx);
    llama_free_model(model);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -1,8 +1,6 @@
-#include "arg.h"
 #include "common.h"
+
 #include "console.h"
-#include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cassert>
@@ -35,7 +33,7 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
+static llama_sampling          ** g_smpl;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@@ -56,7 +54,7 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
@@ -65,7 +63,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@@ -83,7 +81,7 @@ static void write_logfile(
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-    llama_perf_dump_yaml(logfile, ctx);
+    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }

@@ -94,14 +92,9 @@ static void sigint_handler(int signo) {
            is_interacting = true;
        } else {
            console::cleanup();
-            LOG("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
+            printf("\n");
+            llama_print_timings(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
-
-            // make sure all logs are flushed
-            LOG("Interrupted by user\n");
-            gpt_log_pause(gpt_log_main());
-
            _exit(130);
        }
    }
@@ -112,93 +105,103 @@ int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    auto & sparams = params.sparams;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("infill", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
-
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-        LOG_ERR("\n************\n");
-        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.rope_freq_base != 0.0) {
-        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
-        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_INF("%s: llama backend init\n", __func__);
+    print_build_info();
+
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
+    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
-    gpt_sampler  * smpl = nullptr;
+    llama_sampling * smpl = nullptr;

    g_model = &model;
    g_ctx = &ctx;
    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
-    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    model = llama_init.model;
    ctx = llama_init.context;

    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
-    LOG_DBG("n_ctx: %d\n", n_ctx);
+    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
+        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_TEE("\n");
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
    const bool add_bos = llama_add_bos_token(model);
    GGML_ASSERT(!llama_add_eos_token(model));
+    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
    std::vector<llama_token> embd_end;
@@ -223,19 +226,18 @@ int main(int argc, char ** argv) {
        embd_inp.push_back(middle_token);
    }

-    LOG_DBG("add_bos: %d\n", add_bos);
-    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
-    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
-    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
+    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
+    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
+    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(model));
-        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

@@ -244,8 +246,9 @@ int main(int argc, char ** argv) {
        params.n_keep = (int)embd_inp.size();
    }

-    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
-    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
+    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
+    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
+

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
@@ -253,21 +256,21 @@ int main(int argc, char ** argv) {
    }

    if (params.verbose_prompt) {
-        LOG_INF("\n");
-        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_TEE("\n");
+        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > 0) {
-        LOG_INF("%s: static prompt based on n_keep: '", __func__);
+        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
-            LOG_CNT("'\n");
+            LOG_TEE("'\n");
        }
-        LOG_INF("\n");
+        LOG_TEE("\n");
    }

    if (params.interactive) {
@@ -284,30 +287,30 @@ int main(int argc, char ** argv) {
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

-        LOG_INF("%s: interactive mode on.\n", __func__);
+        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (params.input_prefix_bos) {
-            LOG_INF("Input prefix with BOS\n");
+            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
-            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
        }

        if (!params.input_suffix.empty()) {
-            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
-    smpl = gpt_sampler_init(model, sparams);
+    LOG_TEE("sampling: \n%s\n", sparams.print_all().c_str());
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("\n\n");

-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
-    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
-
-    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-
-    LOG_INF("\n");
-    LOG_INF("\n#####  Infill mode  #####\n\n");
+    LOG_TEE("\n#####  Infill mode  #####\n\n");
+    if (params.infill) {
+        printf("\n************\n");
+        printf("no need to specify '--infill', always running infill\n");
+        printf("************\n\n");
+    }
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
@@ -318,11 +321,11 @@ int main(int argc, char ** argv) {
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
-        LOG_INF("== Running in interactive mode. ==\n");
+        LOG_TEE("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_INF(       "%s\n", control_message);
+        LOG_TEE(       "%s\n", control_message);

        is_interacting = params.interactive_first;
    }
@@ -342,6 +345,8 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

+    smpl = llama_sampling_init(model, sparams);
+
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
@@ -355,8 +360,9 @@ int main(int argc, char ** argv) {
                embd.resize(max_embd_size);

                console::set_display(console::error);
-                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
+                fflush(stdout);
            }

            // infinite text generation via context swapping
@@ -365,14 +371,14 @@ int main(int argc, char ** argv) {
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() > n_ctx) {
                if (params.n_predict == -2) {
-                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
                }

                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

-                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
@@ -380,9 +386,9 @@ int main(int argc, char ** argv) {

                n_past -= n_discard;

-                LOG_DBG("after swap: n_past = %d\n", n_past);
+                LOG("after swap: n_past = %d\n", n_past);

-                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
+                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

            }

@@ -394,16 +400,16 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_ERR("%s : failed to eval\n", __func__);
+                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

-                LOG_DBG("n_past = %d\n", n_past);
+                LOG("n_past = %d\n", n_past);
            }

        }
@@ -411,11 +417,11 @@ int main(int argc, char ** argv) {
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = llama_sampling_sample(smpl, ctx, -1);

-            gpt_sampler_accept(smpl, id, true);
+            llama_sampling_accept(smpl, id, true);

-            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
+            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -425,16 +431,16 @@ int main(int argc, char ** argv) {
            // decrement remaining sampling budget
            --n_remain;

-            LOG_DBG("n_remain: %d\n", n_remain);
+            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
-            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
+                llama_sampling_accept(smpl, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -447,7 +453,7 @@ int main(int argc, char ** argv) {
        if (input_echo) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id);
-                LOG("%s", token_str.c_str());
+                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
                    input_tokens.push_back(id);
@@ -456,6 +462,7 @@ int main(int argc, char ** argv) {
                    output_ss << token_str;
                }
            }
+            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -465,12 +472,13 @@ int main(int argc, char ** argv) {
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
-            if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
+            if ((llama_sampling_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
-                    LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
-                LOG("\n");
+                fflush(stdout);
+                printf("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
@@ -526,33 +534,35 @@ int main(int argc, char ** argv) {
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
+                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
-                LOG_DBG("found EOS token\n");
+            else if (llama_token_is_eog(model, llama_sampling_last(smpl))) {
+                LOG("found EOS token\n");

                if (params.interactive) {

                    is_interacting = true;
-                    LOG("\n");
+                    printf("\n");
                    console::set_display(console::user_input);
+                    fflush(stdout);
               }
            }

            if (n_past > 0 && is_interacting && !params.interactive) {
-                LOG_DBG("waiting for user input\n");
+                LOG("waiting for user input\n");

                if (params.input_prefix_bos) {
-                    LOG_DBG("adding input prefix BOS token\n");
+                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
-                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
-                    LOG("%s", buffer.c_str());
+                    printf("%s", buffer.c_str());
                }

                std::string line;
@@ -570,17 +580,17 @@ int main(int argc, char ** argv) {
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
-                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
-                        LOG("%s", params.input_suffix.c_str());
+                        printf("%s", params.input_suffix.c_str());
                    }

-                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
+                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

@@ -591,9 +601,9 @@ int main(int argc, char ** argv) {
                    }

                    n_remain -= line_inp.size();
-                    LOG_DBG("n_remain: %d\n", n_remain);
+                    LOG("n_remain: %d\n", n_remain);
                } else {
-                    LOG_DBG("empty line, passing control back\n");
+                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
@@ -601,7 +611,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    llama_sampling_reset(smpl);
                }
                is_interacting = false;
            }
@@ -620,18 +630,22 @@ int main(int argc, char ** argv) {
        }
    }
    if (!params.interactive && n_remain <= 0) {
-        LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+        fflush(stdout);
    }

-    LOG("\n");
-    gpt_perf_print(ctx, smpl);
+    llama_print_timings(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
    llama_free_model(model);

-    gpt_sampler_free(smpl);
+    llama_sampling_free(smpl);
    llama_backend_free();

+#ifndef LOG_DISABLE_LOGS
+    LOG_TEE("Log end\n");
+#endif // LOG_DISABLE_LOGS
+
    return 0;
 }
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -14,8 +14,7 @@ Performance testing tool for llama.cpp.
    1. [Markdown](#markdown)
    2. [CSV](#csv)
    3. [JSON](#json)
-    4. [JSONL](#jsonl)
-    5. [SQL](#sql)
+    4. [SQL](#sql)

 ## Syntax

@@ -24,34 +23,27 @@ usage: ./llama-bench [options]

 options:
  -h, --help
-  -m, --model <filename>                    (default: models/7B/ggml-model-q4_0.gguf)
-  -p, --n-prompt <n>                        (default: 512)
-  -n, --n-gen <n>                           (default: 128)
-  -pg <pp,tg>                               (default: )
-  -b, --batch-size <n>                      (default: 2048)
-  -ub, --ubatch-size <n>                    (default: 512)
-  -ctk, --cache-type-k <t>                  (default: f16)
-  -ctv, --cache-type-v <t>                  (default: f16)
-  -t, --threads <n>                         (default: 8)
-  -C, --cpu-mask <hex,hex>                  (default: 0x0)
-  --cpu-strict <0|1>                        (default: 0)
-  --poll <0...100>                          (default: 50)
-  -ngl, --n-gpu-layers <n>                  (default: 99)
-  -rpc, --rpc <rpc_servers>                 (default: )
-  -sm, --split-mode <none|layer|row>        (default: layer)
-  -mg, --main-gpu <i>                       (default: 0)
-  -nkvo, --no-kv-offload <0|1>              (default: 0)
-  -fa, --flash-attn <0|1>                   (default: 0)
-  -mmp, --mmap <0|1>                        (default: 1)
-  --numa <distribute|isolate|numactl>       (default: disabled)
-  -embd, --embeddings <0|1>                 (default: 0)
-  -ts, --tensor-split <ts0/ts1/..>          (default: 0)
-  -r, --repetitions <n>                     (default: 5)
-  --prio <0|1|2|3>                          (default: 0)
-  --delay <0...N> (seconds)                 (default: 0)
-  -o, --output <csv|json|jsonl|md|sql>      (default: md)
-  -oe, --output-err <csv|json|jsonl|md|sql> (default: none)
-  -v, --verbose                             (default: 0)
+  -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
+  -p, --n-prompt <n>                  (default: 512)
+  -n, --n-gen <n>                     (default: 128)
+  -pg <pp,tg>                         (default: 512,128)
+  -b, --batch-size <n>                (default: 2048)
+  -ub, --ubatch-size <n>              (default: 512)
+  -ctk, --cache-type-k <t>            (default: f16)
+  -ctv, --cache-type-v <t>            (default: f16)
+  -t, --threads <n>                   (default: 16)
+  -ngl, --n-gpu-layers <n>            (default: 99)
+  -sm, --split-mode <none|layer|row>  (default: layer)
+  -mg, --main-gpu <i>                 (default: 0)
+  -nkvo, --no-kv-offload <0|1>        (default: 0)
+  -fa, --flash-attn <0|1>             (default: 0)
+  -mmp, --mmap <0|1>                  (default: 1)
+  --numa <distribute|isolate|numactl> (default: disabled)
+  -embd, --embeddings <0|1>           (default: 0)
+  -ts, --tensor-split <ts0/ts1/..>    (default: 0)
+  -r, --repetitions <n>               (default: 5)
+  -o, --output <csv|json|md|sql>      (default: md)
+  -v, --verbose                       (default: 0)

 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
@@ -246,19 +238,6 @@ $ ./llama-bench -o json
 ]
 ```

-
-### JSONL
-
-```sh
-$ ./llama-bench -o jsonl
-```
-
-```json lines
-{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
-{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
-```
-
-
 ### SQL

 SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -124,9 +124,6 @@ static std::string get_cpu_info() {
                        (LPBYTE)cpu_brand,
                        &cpu_brand_size) == ERROR_SUCCESS) {
        id.assign(cpu_brand, cpu_brand_size);
-        if (id.find('\0') != std::string::npos) {
-            id.resize(id.find('\0'));
-        }
    }
    RegCloseKey(hKey);
 #endif
@@ -174,14 +171,13 @@ static std::string get_gpu_info() {
 }

 // command line params
-enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};

 static const char * output_format_str(output_formats format) {
    switch (format) {
        case NONE:     return "none";
        case CSV:      return "csv";
        case JSON:     return "json";
-        case JSONL:    return "jsonl";
        case MARKDOWN: return "md";
        case SQL:      return "sql";
        default: GGML_ABORT("invalid output format");
@@ -195,8 +191,6 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
        format = CSV;
    } else if (s == "json") {
        format = JSON;
-    } else if (s == "jsonl") {
-        format = JSONL;
    } else if (s == "md") {
        format = MARKDOWN;
    } else if (s == "sql") {
@@ -249,7 +243,6 @@ struct cmd_params {
    ggml_sched_priority prio;
    int delay;
    bool verbose;
-    bool progress;
    output_formats output_format;
    output_formats output_format_stderr;
 };
@@ -281,7 +274,6 @@ static const cmd_params cmd_params_defaults = {
    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
    /* delay                */ 0,
    /* verbose              */ false,
-    /* progress             */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
 };
@@ -291,37 +283,34 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
-    printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                        (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -pg <pp,tg>                               (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
-    printf("  -b, --batch-size <n>                      (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub, --ubatch-size <n>                    (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -t, --threads <n>                         (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
-    printf("  --cpu-strict <0|1>                        (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
-    printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
-    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-#ifdef GGML_USE_RPC
-    printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-#endif
-    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf("  -mg, --main-gpu <i>                       (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
-    printf("  -mmp, --mmap <0|1>                        (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  --numa <distribute|isolate|numactl>       (default: disabled)\n");
-    printf("  -embd, --embeddings <0|1>                 (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
-    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
-    printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
-    printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
-    printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
-    printf("  -o, --output <csv|json|jsonl|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf("  -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
-    printf("  -v, --verbose                             (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
-    printf("  --progress                                (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
+    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -pg <pp,tg>                         (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
+    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  -ub, --ubatch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -C, --cpu-mask <hex,hex>            (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf("  --cpu-strict <0|1>                  (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf("  --poll <0...100>                    (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
+    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
+    printf("  --prio <0|1|2|3>                    (default: %d)\n", cmd_params_defaults.prio);
+    printf("  --delay <0...N> (seconds)           (default: %d)\n", cmd_params_defaults.delay);
+    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
@@ -367,7 +356,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    params.numa = cmd_params_defaults.numa;
    params.prio = cmd_params_defaults.prio;
    params.delay = cmd_params_defaults.delay;
-    params.progress = cmd_params_defaults.progress;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -439,9 +427,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                types.push_back(gt);
            }
-            if (invalid_param) {
-                break;
-            }
            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
        } else if (arg == "-ctv" || arg == "--cache-type-v") {
            if (++i >= argc) {
@@ -458,9 +443,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                types.push_back(gt);
            }
-            if (invalid_param) {
-                break;
-            }
            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
@@ -497,14 +479,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = string_split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-#ifdef GGML_USE_RPC
        } else if (arg == "-rpc" || arg == "--rpc") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rpc_servers.push_back(argv[i]);
-#endif
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
@@ -526,9 +506,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                }
                modes.push_back(mode);
            }
-            if (invalid_param) {
-                break;
-            }
            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
        } else if (arg == "-mg" || arg == "--main-gpu") {
            if (++i >= argc) {
@@ -629,8 +606,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
-        } else if (arg == "--progress") {
-            params.progress = true;
        } else {
            invalid_param = true;
            break;
@@ -1099,39 +1074,38 @@ struct csv_printer : public printer {
    }
 };

-
-static std::string escape_json(const std::string & value) {
-    std::string escaped;
-    for (auto c : value) {
-        if (c == '"') {
-            escaped += "\\\"";
-        } else if (c == '\\') {
-            escaped += "\\\\";
-        } else  if (c <= 0x1f) {
-            char buf[8];
-            snprintf(buf, sizeof(buf), "\\u%04x", c);
-            escaped += buf;
-        } else {
-            escaped += c;
-        }
-    }
-    return escaped;
-}
-
-static std::string format_json_value(const std::string & field, const std::string & value) {
-    switch (test::get_field_type(field)) {
-        case test::STRING:
-            return "\"" + escape_json(value) + "\"";
-        case test::BOOL:
-            return value == "0" ? "false" : "true";
-        default:
-            return value;
-    }
-}
-
 struct json_printer : public printer {
    bool first = true;

+    static std::string escape_json(const std::string & value) { // returns `value` escaped for use inside a JSON string literal
+        std::string escaped;
+        for (auto c : value) {
+            if (c == '"') {
+                escaped += "\\\"";
+            } else if (c == '\\') {
+                escaped += "\\\\";
+            } else if (static_cast<unsigned char>(c) <= 0x1f) { // compare unsigned: with signed char, bytes >= 0x80 (UTF-8) are negative and must pass through
+                char buf[8];
+                snprintf(buf, sizeof(buf), "\\u%04x", static_cast<unsigned>(c)); // c is 0x00..0x1f here, so "\uXXXX" (6 chars + NUL) fits buf
+                escaped += buf;
+            } else {
+                escaped += c;
+            }
+        }
+        return escaped;
+    }
+
+    static std::string format_value(const std::string & field, const std::string & value) { // renders `value` as a JSON token based on the type test::get_field_type reports for `field`
+        switch (test::get_field_type(field)) {
+            case test::STRING:
+                return "\"" + escape_json(value) + "\""; // strings are escaped and wrapped in double quotes
+            case test::BOOL:
+                return value == "0" ? "false" : "true"; // only the exact text "0" maps to false; anything else is true
+            default:
+                return value; // every other field type is emitted verbatim
+        }
+    }
+
    void print_header(const cmd_params & params) override {
        fprintf(fout, "[\n");
        (void) params;
@@ -1140,7 +1114,7 @@ struct json_printer : public printer {
    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
        assert(fields.size() == values.size());
        for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
        }
    }

@@ -1163,25 +1137,6 @@ struct json_printer : public printer {
    }
 };

-
-struct jsonl_printer : public printer {
-    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
-        assert(fields.size() == values.size());
-        for (size_t i = 0; i < fields.size(); i++) {
-            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
-        }
-    }
-
-    void print_test(const test & t) override {
-        fprintf(fout, "{");
-        print_fields(test::get_fields(), t.get_values());
-        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
-        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
-        fprintf(fout, "}\n");
-        fflush(fout);
-    }
-};
-
 struct markdown_printer : public printer {
    std::vector<std::string> fields;

@@ -1482,8 +1437,6 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
            return std::unique_ptr<printer>(new csv_printer());
        case JSON:
            return std::unique_ptr<printer>(new json_printer());
-        case JSONL:
-            return std::unique_ptr<printer>(new jsonl_printer());
        case MARKDOWN:
            return std::unique_ptr<printer>(new markdown_printer());
        case SQL:
@@ -1538,13 +1491,7 @@ int main(int argc, char ** argv) {
    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

-    int params_idx = 0;
-    auto params_count = params_instances.size();
    for (const auto & inst : params_instances) {
-        params_idx ++;
-        if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
-        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
@@ -1577,7 +1524,7 @@ int main(int argc, char ** argv) {

        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
-            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
            exit(1);
        }
        tpp.strict_cpu = t.cpu_strict;
@@ -1586,7 +1533,7 @@ int main(int argc, char ** argv) {

        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
        if (!threadpool) {
-            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
            exit(1);
        }

@@ -1594,16 +1541,10 @@ int main(int argc, char ** argv) {

        // warmup run
        if (t.n_prompt > 0) {
-            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
-            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
-            if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
-            }
            test_gen(ctx, 1, 0, t.n_threads);
        }

@@ -1613,15 +1554,9 @@ int main(int argc, char ** argv) {
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
-                }
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
-                if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
-                }
                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
            }

@@ -1639,7 +1574,7 @@ int main(int argc, char ** argv) {
            fflush(p_err->fout);
        }

-        llama_perf_context_print(ctx);
+        llama_print_timings(ctx, nullptr);

        llama_free(ctx);

--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -269,6 +269,12 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
    return env->NewStringUTF(result.str().c_str());
 }

+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { // native side of LLamaAndroid.free_batch(batch: Long)
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer)); // NOTE(review): frees via a by-value copy of the pointed-to struct; the llama_batch object itself is not deleted here - confirm against new_batch's allocation
+}
+
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@@ -305,29 +311,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
    return reinterpret_cast<jlong>(batch);
 }

-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
-extern "C"
-JNIEXPORT jlong JNICALL
-Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
-    auto sparams = llama_sampler_chain_default_params();
-    sparams.no_perf = true;
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
-
-    return reinterpret_cast<jlong>(smpl);
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
-    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
-}
-
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@@ -397,22 +380,26 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        JNIEnv * env,
        jobject,
        jlong context_pointer,
+        jlong sampling_pointer,
        jlong batch_pointer,
-        jlong sampler_pointer,
        jint n_len,
        jobject intvar_ncur
 ) {
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto batch   = reinterpret_cast<llama_batch   *>(batch_pointer);
-    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
+    const auto sampling = reinterpret_cast<llama_sampling *>(sampling_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const auto model = llama_get_model(context);

    if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");

+    const auto * logits = llama_get_logits_ith(context, batch->n_tokens - 1);
+
+    llama_sampling_set_logits(sampling, logits);
+
    // sample the most likely token
-    const auto new_token_id = llama_sampler_sample(sampler, context, -1);
+    const auto new_token_id = llama_sampling_sample_greedy(sampling, nullptr);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
--- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -45,10 +45,8 @@ class LLamaAndroid {
    private external fun free_context(context: Long)
    private external fun backend_init(numa: Boolean)
    private external fun backend_free()
-    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
    private external fun free_batch(batch: Long)
-    private external fun new_sampler(): Long
-    private external fun free_sampler(sampler: Long)
+    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
    private external fun bench_model(
        context: Long,
        model: Long,
@@ -71,7 +69,6 @@ class LLamaAndroid {
    private external fun completion_loop(
        context: Long,
        batch: Long,
-        sampler: Long,
        nLen: Int,
        ncur: IntVar
    ): String?
@@ -104,11 +101,8 @@ class LLamaAndroid {
                    val batch = new_batch(512, 0, 1)
                    if (batch == 0L) throw IllegalStateException("new_batch() failed")

-                    val sampler = new_sampler()
-                    if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
-
                    Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch, sampler))
+                    threadLocalState.set(State.Loaded(model, context, batch))
                }
                else -> throw IllegalStateException("Model already loaded")
            }
@@ -120,7 +114,7 @@ class LLamaAndroid {
            is State.Loaded -> {
                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
+                    val str = completion_loop(state.context, state.batch, nlen, ncur)
                    if (str == null) {
                        break
                    }
@@ -144,7 +138,6 @@ class LLamaAndroid {
                    free_context(state.context)
                    free_model(state.model)
                    free_batch(state.batch)
-                    free_sampler(state.sampler);

                    threadLocalState.set(State.Idle)
                }
@@ -168,7 +161,7 @@ class LLamaAndroid {

        private sealed interface State {
            data object Idle: State
-            data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
+            data class Loaded(val model: Long, val context: Long, val batch: Long): State
        }

        // Enforce only one instance of Llm.
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -24,7 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
-    private var sampling: UnsafeMutablePointer<llama_sampler>
+    private var sampling: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    var is_done: Bool = false
@@ -43,15 +43,11 @@ actor LlamaContext {
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
        self.temporary_invalid_cchars = []
-        let sparams = llama_sampler_chain_default_params()
-        self.sampling = llama_sampler_chain_init(sparams)
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
-        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
+        self.sampling = llama_sampling_init(context, llama_sampling_default_params())
    }

    deinit {
-        llama_sampler_free(sampling)
+        llama_sampling_free(sampling)
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
@@ -150,7 +146,12 @@ actor LlamaContext {
    func completion_loop() -> String {
        var new_token_id: llama_token = 0

-        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
+        let n_vocab = llama_n_vocab(model)
+        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
+
+        llama_sampling_set_logits(sampling, logits);
+
+        new_token_id = llama_sampling_sample_greedy(sampling, nil)

        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 ```

 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf.py \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
    --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf.py \
 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
 ```

-5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
+5. Use `llama-quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```

 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -39,11 +40,6 @@
 #include <cinttypes>
 #include <limits>

-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-
 //#define CLIP_DEBUG_FUNCTIONS

 // RGB uint8 image
@@ -169,7 +165,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
    int i = gguf_find_key(ctx, key);
    if (i == -1) {
-        LOG_ERR("key %s not found in file\n", key);
+        LOG_TEE("key %s not found in file\n", key);
        throw std::runtime_error(format("Missing required key: %s", key));
    }

@@ -274,7 +270,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
    size_t tensor_size = ggml_nbytes(tensor);
-    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -292,7 +288,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -311,7 +307,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -572,7 +568,7 @@ struct clip_ctx {

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return nullptr;
    }

@@ -586,7 +582,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        if (load_image_size == nullptr) {
            load_image_size = clip_image_size_init();
        }
-        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
        image_size_width  = load_image_size->width;
        image_size_height = load_image_size->height;
        if (is_inf) {
@@ -1051,21 +1047,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
        }
-        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
-        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
-        LOG_INF("%s: n_kv:         %d\n", __func__, n_kv);
-        LOG_INF("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        LOG_INF("\n");
+        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
+        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_TEE("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    // kv
    const int n_kv = gguf_get_n_kv(ctx);
-    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
@@ -1076,7 +1072,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            n_type[type]++;
        }

-        LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -1092,7 +1088,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            replace_all(value, "\n", "\\n");

-            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
@@ -1101,7 +1097,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                continue;
            }

-            LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

@@ -1116,7 +1112,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            size_t tensor_size = ggml_nbytes(cur);
            model_size += tensor_size;
            if (verbosity >= 3) {
-                LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
@@ -1143,27 +1139,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

 #ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
 #endif

 #ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif

 #ifdef GGML_USE_CANN
    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
 #endif

 #ifdef GGML_USE_VULKAN
    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
 #endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
-        LOG_INF("%s: CLIP using CPU backend\n", __func__);
+        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
    }

    // model size and capabilities
@@ -1198,16 +1194,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        if (verbosity >= 1) {
-            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
-            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);

    // load tensors
    {
@@ -1220,7 +1216,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        new_clip->ctx_data = ggml_init(params);
        if (!new_clip->ctx_data) {
-            LOG_ERR("%s: ggml_init() failed\n", __func__);
+            LOG_TEE("%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1228,7 +1224,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
-            LOG_ERR("cannot open model file for loading tensors\n");
+            LOG_TEE("cannot open model file for loading tensors\n");
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1250,7 +1246,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
-                LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
                gguf_free(ctx);
                return nullptr;
@@ -1321,23 +1317,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        if (verbosity >= 2) {
-            LOG_INF("\n%s: vision model hparams\n", __func__);
-            LOG_INF("image_size         %d\n", hparams.image_size);
-            LOG_INF("patch_size         %d\n", hparams.patch_size);
-            LOG_INF("v_hidden_size      %d\n", hparams.hidden_size);
-            LOG_INF("v_n_intermediate   %d\n", hparams.n_intermediate);
-            LOG_INF("v_projection_dim   %d\n", hparams.projection_dim);
-            LOG_INF("v_n_head           %d\n", hparams.n_head);
-            LOG_INF("v_n_layer          %d\n", hparams.n_layer);
-            LOG_INF("v_eps              %f\n", hparams.eps);
-            LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            LOG_INF("v_image_grid_pinpoints: ");
+            LOG_TEE("\n%s: vision model hparams\n", __func__);
+            LOG_TEE("image_size         %d\n", hparams.image_size);
+            LOG_TEE("patch_size         %d\n", hparams.patch_size);
+            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_TEE("v_n_head           %d\n", hparams.n_head);
+            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
+            LOG_TEE("v_eps              %f\n", hparams.eps);
+            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_TEE("v_image_grid_pinpoints: ");
            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
            }
-            LOG_INF("\n");
-            LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_TEE("\n");
+            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);

        }

@@ -1375,7 +1371,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        } catch(const std::exception& /*e*/) {
-            LOG_ERR("%s: failed to load vision model tensors\n", __func__);
+            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

        // LLaVA projection
@@ -1404,7 +1400,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            } catch (std::runtime_error & /*e*/) { }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@@ -1505,7 +1501,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
@@ -1556,7 +1552,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
+        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1568,7 +1564,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
    int nx, ny, nc;
    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_ERR("%s: failed to decode image bytes\n", __func__);
+        LOG_TEE("%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1758,7 +1754,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -1876,7 +1872,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
    const int multiple = fmin(ceil(ratio), max_slice_nums);

    std::vector<std::vector<clip_image_u8 *>> images;
-    LOG_INF("%s: multiple %d\n", __func__, multiple);
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
    images.push_back(std::vector<clip_image_u8 *>());

    if (multiple <= 1) {
@@ -1891,17 +1887,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
        clip_image_u8 * source_image = clip_image_u8_init();
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
        images[images.size()-1].push_back(source_image);

        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);

        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
        clip_image_u8 * refine_image = clip_image_u8_init();
        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);

-        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);

        // split_to_patches
        int width = refine_image->nx;
@@ -1958,7 +1954,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        int idx = 0;
        for (size_t i = 0; i < imgs.size(); ++i) {
            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
                clip_image_f32 * res = clip_image_f32_init();
                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
                res_imgs->data[idx++] = *res;
@@ -1970,7 +1966,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli

    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }
    auto & params = ctx->vision_model.hparams;
@@ -2047,7 +2043,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }

            for (size_t i = 0; i < patches.size(); i++) {
-                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                clip_image_u8_free(patches[i]);
            }

@@ -2283,7 +2279,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co

 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -2295,7 +2291,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_ERR("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -2444,10 +2440,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

+#ifdef GGML_USE_METAL
+    if (ggml_backend_is_metal(ctx->backend)) {
+        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
+    }
+#endif
+
    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
@@ -2519,7 +2521,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@@ -2538,7 +2540,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                f32_data = (float *)conv_buf.data();
                break;
            default:
-                LOG_ERR("Please use an input file in f32 or f16\n");
+                LOG_TEE("Please use an input file in f32 or f16\n");
                gguf_free(ctx_out);
                return false;
            }
@@ -2565,7 +2567,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -2581,8 +2583,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        LOG_INF("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -274,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)


 if has_llava_projector:
-    model.vision_model.encoder.layers.pop(-1)
+    model.vision_model.encoder.layers.pop(-1)  # pyright: ignore[reportAttributeAccessIssue]
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
@@ -288,7 +288,7 @@ if has_llava_projector:

    print("Projector tensors added\n")

-state_dict = model.state_dict()
+state_dict = model.state_dict()  # pyright: ignore[reportAttributeAccessIssue]
 for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,16 +1,14 @@
-#include "arg.h"
-#include "base64.hpp"
+#include "ggml.h"
 #include "log.h"
 #include "common.h"
-#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-#include "ggml.h"
+
+#include "base64.hpp"

 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
 #include <vector>

 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
@@ -21,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -42,11 +40,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
    return true;
 }

-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct llama_sampling * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-    gpt_sampler_accept(smpl, id, true);
+    const llama_token id = llama_sampling_sample(smpl, ctx_llama, -1);
+    llama_sampling_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@@ -76,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

@@ -90,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
-        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
+        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

@@ -114,10 +112,12 @@ struct llava_context {
    struct llama_model * model = NULL;
 };

-static void print_usage(int, char ** argv) {
-    LOG("\n example usage:\n");
-    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\n example usage:\n");
+    LOG_TEE("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -127,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
-            LOG_INF("using base64 encoded image instead of command line image path\n");
+            LOG_TEE("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
        if (!embed) {
-            LOG_ERR("%s: can't load image from prompt\n", __func__);
+            LOG_TEE("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
@@ -157,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
+        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
-        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
+        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
@@ -178,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }
@@ -189,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    // generate the response

-    LOG("\n");
+    LOG_TEE("\n");

-    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    struct llama_sampling * smpl = llama_sampling_init(ctx_llava->model, params->sparams);
    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }

@@ -203,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
-        LOG("%s", tmp);
+        printf("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -211,8 +211,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        fflush(stdout);
    }

-    gpt_sampler_free(smpl);
-    LOG("\n");
+    llama_sampling_free(smpl);
+    printf("\n");
 }

 static struct llama_model * llava_init(gpt_params * params) {
@@ -223,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
@@ -246,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->ctx_clip = ctx_clip;
@@ -269,54 +269,65 @@ static void llava_free(struct llava_context * ctx_llava) {
    llama_backend_free();
 }

+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
    ggml_time_init();

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv);
+        print_usage(argc, argv, {});
        return 1;
    }
-
-    auto * model = llava_init(&params);
+    auto model = llava_init(&params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
        return 1;
    }

    if (prompt_contains_image(params.prompt)) {
-        auto * ctx_llava = llava_init_context(&params, model);
+        auto ctx_llava = llava_init_context(&params, model);

-        auto * image_embed = load_image(ctx_llava, &params, "");
+        auto image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-        llama_perf_context_print(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama, nullptr);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
    } else {
        for (auto & image : params.image) {
-            auto * ctx_llava = llava_init_context(&params, model);
+            auto ctx_llava = llava_init_context(&params, model);

-            auto * image_embed = load_image(ctx_llava, &params, image);
+            auto image_embed = load_image(ctx_llava, &params, image);
            if (!image_embed) {
-                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
+                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
                return 1;
            }

            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

-            llama_perf_context_print(ctx_llava->ctx_llama);
+            llama_print_timings(ctx_llava->ctx_llama, nullptr);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,23 +1,13 @@
 #include "clip.h"
-#include "llava.h"
-
+#include "common.h"
 #include "llama.h"
+#include "llava.h"
+#include "base64.hpp"

-#include <algorithm>
-#include <cerrno>
 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
-#include <limits>
 #include <vector>
-
-#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
-#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
-#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#include <numeric>

 // RGB uint8 image
 struct clip_image_u8 {
@@ -64,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -194,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = ggml_graph_node(gf, -1);
+    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
    // append without newline tokens (default behavior in llava_arch when not using unpad ):
@@ -246,7 +236,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        LOG_ERR("%s: unable to preprocess image\n", __func__);
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }
@@ -275,14 +265,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -297,7 +287,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        load_image_size->width = img->nx;
        load_image_size->height = img->ny;
        clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
@@ -305,7 +295,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded) {
-            LOG_ERR("Unable to encode image\n");
+            LOG_TEE("Unable to encode image\n");

            return false;
        }
@@ -319,12 +309,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -357,12 +347,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

-    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
 }
@@ -372,7 +362,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd) {
-        LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
@@ -385,13 +375,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    }
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    if (!image_embd) {
-        LOG_ERR("Unable to allocate memory for image embeddings\n");
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        LOG_ERR("%s: cannot encode image, aborting\n", __func__);
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
@@ -411,7 +401,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
@@ -423,7 +413,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
-        LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return NULL;
    }

@@ -432,7 +422,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result) {
        clip_image_u8_free(img);
-        LOG_ERR("%s: coulnd't embed the image\n", __func__);
+        LOG_TEE("%s: coulnd't embed the image\n", __func__);
        return NULL;
    }

@@ -446,7 +436,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
    auto file = fopen(path, "rb");
    if (file == NULL) {
-        LOG_ERR("%s: can't read file %s\n", __func__, path);
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

@@ -456,7 +446,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
    if (buffer == NULL) {
-        LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
@@ -481,7 +471,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded) {
-        LOG_ERR("%s: failed to load %s\n", __func__, image_path);
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -1,18 +1,13 @@
-#include "arg.h"
+#include "ggml.h"
 #include "log.h"
 #include "common.h"
-#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-#include "ggml.h"

-#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
 #include <vector>
-#include <iostream> // TODO: remove me

 struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
@@ -21,8 +16,14 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
 }

 static struct llama_model * llava_init(gpt_params * params) {
@@ -33,7 +34,7 @@ static struct llama_model * llava_init(gpt_params * params) {

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
@@ -48,7 +49,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    if (params->n_ctx < 2048) {
        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
-        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
        ctx_params.n_ctx = 2048;
    } else {
        ctx_params.n_ctx = params->n_ctx;
@@ -57,11 +58,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

-    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->model = model;
@@ -86,7 +87,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }
-    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    return ctx_clip;
 }

@@ -98,7 +99,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -122,7 +123,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));

-    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
    slice_embed->embed = image_embed;
    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
@@ -140,7 +141,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    else if (has_minicpmv_projector == 3) {
        system_prompt = "<|im_start|>user\n";
    }
-    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@@ -159,14 +160,14 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
        }
        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
    }
-    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
 }

-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct llama_sampling * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past) {
-    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-    gpt_sampler_accept(smpl, id, true);
+    const llama_token id = llama_sampling_sample(smpl, ctx_llama, -1);
+    llama_sampling_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
@@ -178,42 +179,42 @@ static const char * sample(struct gpt_sampler * smpl,
 }

 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
-    auto * ctx_clip = clip_init_context(params);
-    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    auto ctx_clip = clip_init_context(params);
+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
    if (!embeds) {
-        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
        return NULL;
    }

    // process the prompt
    if (params->prompt.empty() && params->interactive == false) {
-        LOG_ERR("prompt should be given or interactive mode should be on");
+        LOG_TEE("prompt should be given or interactive mode should be on");
        return NULL;
    }

-    auto * model = llava_init(params);
+    auto model = llava_init(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
        return NULL;
    }
    const int64_t t_llava_init_start_us = ggml_time_us();
-    auto * ctx_llava = llava_init_context(params, model);
+    auto ctx_llava = llava_init_context(params, model);
    ctx_llava->ctx_clip = ctx_clip;
    const int64_t t_llava_init_end_us = ggml_time_us();
    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
-    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

    const int64_t t_process_image_start_us = ggml_time_us();
    process_image(ctx_llava, embeds, params, n_past);
    const int64_t t_process_image_end_us = ggml_time_us();
    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
-    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

    llava_image_embed_free(embeds);
    return ctx_llava;
 }

-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+static struct llama_sampling * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
    if (!is_first) {
@@ -235,13 +236,13 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par

    // generate the response

-    LOG_INF("\n");
+    LOG_TEE("\n");

-    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    struct llama_sampling * smpl = llama_sampling_init(ctx_llava->model, params->sparams);
    return smpl;
 }

-static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling * smpl, int &n_past){

    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
    return tmp;
@@ -252,36 +253,41 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
        return 1;
    }

-    gpt_init();
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty())) {
+        gpt_params_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }

    for (auto & image : params.image) {
        int n_past = 0;
-        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+        auto ctx_llava = minicpmv_init(&params, image, n_past);

        if (!params.prompt.empty()) {
-            LOG("<user>%s\n", params.prompt.c_str());
-            LOG("<assistant>");
-            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            LOG_TEE("<user>%s\n", params.prompt.c_str());
+            LOG_TEE("<assistant>");
+            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-            std::string response;
+            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
-                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                auto tmp = llama_loop(ctx_llava, smpl, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
-                    if (!have_tmp) {
-                        continue;
-                    }
-                    break;
+                    if(!have_tmp)continue;
+                    else break;
                }
                if (strstr(tmp, "###")) break; // Yi-VL behavior
                have_tmp = true;
@@ -290,18 +296,18 @@ int main(int argc, char ** argv) {

                fflush(stdout);
            }
-            gpt_sampler_free(smpl);
+            llama_sampling_free(smpl);
        }else {
            while (true) {
-                LOG("<user>");
+                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
-                LOG("<assistant>");
-                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                LOG_TEE("<assistant>");
+                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-                std::string response;
+                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
-                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -309,11 +315,11 @@ int main(int argc, char ** argv) {
                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                    fflush(stdout);
                }
-                gpt_sampler_free(smpl);
+                llama_sampling_free(smpl);
            }
        }
        printf("\n");
-        llama_perf_context_print(ctx_llava->ctx_llama);
+        llama_print_timings(ctx_llava->ctx_llama, nullptr);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,7 +1,4 @@
-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cstdio>
@@ -39,18 +36,23 @@ struct ngram_container {
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    const int W = 15; // lookahead window
    const int N = 5;  // n-gram size
    const int G = 15; // max verification n-grams

    const bool dump_kv_cache = params.dump_kv_cache;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("lookahead", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -72,14 +74,14 @@ int main(int argc, char ** argv) {
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
-        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);
@@ -115,7 +117,7 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

    // target model sampling context
-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct llama_sampling * smpl = llama_sampling_init(model, params.sparams);

    // verification n-grams
    std::vector<ngram_data> ngrams_cur(G);
@@ -156,14 +158,14 @@ int main(int argc, char ** argv) {

    // sample first token
    {
-        id = gpt_sampler_sample(smpl, ctx, 0);
+        id = llama_sampling_sample(smpl, ctx, 0);

-        gpt_sampler_accept(smpl, id, true);
+        llama_sampling_accept(smpl, id, true);

        {
            const std::string token_str = llama_token_to_piece(ctx, id);

-            LOG("%s", token_str.c_str());
+            printf("%s", token_str.c_str());
            fflush(stdout);
        }
    }
@@ -253,7 +255,7 @@ int main(int argc, char ** argv) {
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
+            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
            return 1;
        }

@@ -281,19 +283,19 @@ int main(int argc, char ** argv) {
            }

            // sample the next token
-            id = gpt_sampler_sample(smpl, ctx, i_batch);
+            id = llama_sampling_sample(smpl, ctx, i_batch);

-            gpt_sampler_accept(smpl, id, true);
+            llama_sampling_accept(smpl, id, true);

            // print
            {
                const std::string token_str = llama_token_to_piece(ctx, id);

                if (v == 0) {
-                    LOG("%s", token_str.c_str());
+                    printf("%s", token_str.c_str());
                } else {
                    // print light cyan
-                    LOG("\033[0;96m%s\033[0m", token_str.c_str());
+                    printf("\033[0;96m%s\033[0m", token_str.c_str());
                }
                fflush(stdout);

@@ -327,21 +329,21 @@ int main(int argc, char ** argv) {
            // print known n-grams starting with token id (debug)
            if (0 && v == 0) {
                if (ngrams_observed.cnt[id] > 0) {
-                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
                }

                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-                    LOG("   - ngram %2d: ", i);
+                    printf("   - ngram %2d: ", i);

                    const int idx = id*(N - 1)*G + i*(N - 1);

                    for (int j = 0; j < N - 1; j++) {
                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);

-                        LOG("%s", token_str.c_str());
+                        printf("%s", token_str.c_str());
                    }

-                    LOG("\n");
+                    printf("\n");
                }
            }

@@ -358,7 +360,7 @@ int main(int argc, char ** argv) {
                if (v == 0) {
                    // sample from the last level
                    for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                        tokens_j[N - 2][i] = llama_sampling_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                    }
                } else {
                    for (int i = 0; i < W; i++) {
@@ -452,25 +454,23 @@ int main(int argc, char ** argv) {

    auto t_dec_end = ggml_time_us();

-    LOG("\n\n");
+    LOG_TEE("\n\n");

-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    LOG_INF("\n");
-    LOG_INF("W = %2d\n", W);
-    LOG_INF("N = %2d\n", N);
-    LOG_INF("G = %2d\n", G);
-    LOG_INF("\n");
-    LOG_INF("n_predict = %d\n", n_predict);
-    LOG_INF("n_accept  = %d\n", n_accept);
+    LOG_TEE("\n");
+    LOG_TEE("W = %2d\n", W);
+    LOG_TEE("N = %2d\n", N);
+    LOG_TEE("G = %2d\n", G);
+    LOG_TEE("\n");
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_accept  = %d\n", n_accept);

-    LOG_INF("\n");
-    gpt_perf_print(ctx, smpl);
-
-    gpt_sampler_free(smpl);
+    llama_print_timings(ctx, smpl);

    llama_kv_cache_view_free(&kvc_view);
+    llama_sampling_free(smpl);

    llama_batch_free(batch);

@@ -479,7 +479,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -1,8 +1,7 @@
-#include "arg.h"
-#include "common.h"
-#include "ngram-cache.h"
 #include "ggml.h"
 #include "llama.h"
+#include "common.h"
+#include "ngram-cache.h"

 #include <cstdint>
 #include <fstream>
@@ -14,7 +13,8 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -40,6 +40,4 @@ int main(int argc, char ** argv){
    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
-
-    return 0;
 }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -1,26 +1,25 @@
-#include "arg.h"
+#include "ggml.h"
 #include "common.h"
+#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
-#include "llama.h"
-#include "ggml.h"

+#include <cmath>
 #include <cstdint>
 #include <cstdio>
-#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    const int n_draft = params.n_draft;

    // init llama.cpp
@@ -50,7 +49,7 @@ int main(int argc, char ** argv){
            try {
                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                exit(1);
            }
        }
@@ -129,7 +128,7 @@ int main(int argc, char ** argv){
            const int64_t eta_min  = eta_ms / (60*1000);
            const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;

-            LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+            LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
        }

        // After each chunk, update the dynamic ngram cache with the context ngram cache:
@@ -137,24 +136,24 @@ int main(int argc, char ** argv){
        ngram_cache_context.clear();
    }

-    LOG("\n");
+    LOG_TEE("\n");

-    LOG_INF("\n");
-    LOG_INF("n_draft      = %d\n", n_draft);
-    LOG_INF("n_predict    = %d\n", n_input - n_input % n_ctx);
-    LOG_INF("n_drafted    = %d\n", n_drafted);
-    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_TEE("\n");
+    LOG_TEE("n_draft      = %d\n", n_draft);
+    LOG_TEE("n_predict    = %d\n", n_input - n_input % n_ctx);
+    LOG_TEE("n_drafted    = %d\n", n_drafted);
+    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_INF("n_accept     = %d\n", n_accept);
-    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("n_accept     = %d\n", n_accept);
+    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -1,10 +1,7 @@
-#include "arg.h"
 #include "ggml.h"
+#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
-#include "sampling.h"
-#include "log.h"
-#include "llama.h"

 #include <cstdint>
 #include <cstdio>
@@ -15,17 +12,22 @@
 int main(int argc, char ** argv){
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.n_draft;

    const bool dump_kv_cache = params.dump_kv_cache;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("lookup", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -55,7 +57,7 @@ int main(int argc, char ** argv){
            try {
                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
            } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                exit(1);
            }
        }
@@ -73,14 +75,14 @@ int main(int argc, char ** argv){
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
-        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);
@@ -102,7 +104,7 @@ int main(int argc, char ** argv){

    bool has_eos = false;

-    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+    struct llama_sampling * smpl = llama_sampling_init(model, params.sparams);

    std::vector<llama_token> draft;

@@ -121,19 +123,19 @@ int main(int argc, char ** argv){
        }

        // print current draft sequence
-        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
+        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());

        int i_dft = 0;
        while (true) {
            // sample from the target model
-            llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
+            llama_token id = llama_sampling_sample(smpl, ctx, i_dft);

-            gpt_sampler_accept(smpl, id, true);
+            llama_sampling_accept(smpl, id, true);

            const std::string token_str = llama_token_to_piece(ctx, id);

            if (!params.use_color) {
-                LOG("%s", token_str.c_str());
+                printf("%s", token_str.c_str());
            }

            if (llama_token_is_eog(model, id)) {
@@ -144,7 +146,7 @@ int main(int argc, char ** argv){

            // check if the target token matches the draft
            if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-                LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                ++n_accept;
                ++n_past;
                ++i_dft;
@@ -158,19 +160,19 @@ int main(int argc, char ** argv){

                if (params.use_color) {
                    // color accepted draft token
-                    LOG("\033[34m%s\033[0m", token_str.c_str());
+                    printf("\033[34m%s\033[0m", token_str.c_str());
                    fflush(stdout);
                }
                continue;
            }

            if (params.use_color) {
-                LOG("%s", token_str.c_str());
+                printf("%s", token_str.c_str());
            }
            fflush(stdout);


-            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

            draft.clear();
            draft.push_back(id);
@@ -221,26 +223,25 @@ int main(int argc, char ** argv){
    llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
    llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);

-    LOG("\n\n");
+    LOG_TEE("\n\n");

-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    LOG_INF("\n");
-    LOG_INF("n_draft      = %d\n", n_draft);
-    LOG_INF("n_predict    = %d\n", n_predict);
-    LOG_INF("n_drafted    = %d\n", n_drafted);
-    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-    LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_TEE("\n");
+    LOG_TEE("n_draft      = %d\n", n_draft);
+    LOG_TEE("n_predict    = %d\n", n_predict);
+    LOG_TEE("n_drafted    = %d\n", n_drafted);
+    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_INF("n_accept     = %d\n", n_accept);
-    LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("n_accept     = %d\n", n_accept);
+    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_INF("\ntarget:\n\n");
-    gpt_perf_print(ctx, smpl);
-
-    gpt_sampler_free(smpl);
+    LOG_TEE("\ntarget:\n");
+    llama_print_timings(ctx, smpl);

+    llama_sampling_free(smpl);
    llama_batch_free(batch_tgt);

    llama_free(ctx);
@@ -248,7 +249,7 @@ int main(int argc, char ** argv){

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -161,8 +161,6 @@ A value of -1 will enable infinite text generation, even though we have a finite

 If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

-The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
-
 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

 ### Temperature
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,11 +1,11 @@
-#include "arg.h"
 #include "common.h"
+
 #include "console.h"
-#include "log.h"
-#include "sampling.h"
 #include "llama.h"

 #include <cassert>
+#include <cinttypes>
+#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -33,7 +33,7 @@

 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
-static gpt_sampler             ** g_smpl;
+static llama_sampling          ** g_smpl;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
@@ -41,15 +41,6 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting  = false;
 static bool need_insert_eot = false;

-static void print_usage(int argc, char ** argv) {
-    (void) argc;
-
-    LOG("\nexample usage:\n");
-    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
-    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
-    LOG("\n");
-}
-
 static bool file_exists(const std::string & path) {
    std::ifstream f(path.c_str());
    return f.good();
@@ -75,7 +66,8 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
        return;
    }

@@ -83,7 +75,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@@ -101,7 +93,7 @@ static void write_logfile(
    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-    llama_perf_dump_yaml(logfile, ctx);
+    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }

@@ -113,81 +105,95 @@ static void sigint_handler(int signo) {
            need_insert_eot = true;
        } else {
            console::cleanup();
-            LOG("\n");
-            gpt_perf_print(*g_ctx, *g_smpl);
+            printf("\n");
+            llama_print_timings(*g_ctx, *g_smpl);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
-
-            // make sure all logs are flushed
-            LOG("Interrupted by user\n");
-            gpt_log_pause(gpt_log_main());
-
            _exit(130);
        }
    }
 }
 #endif

-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+// Log callback registered via llama_log_set(): forwards each message text to LOG_TEE.
+static void llama_log_callback_logTee(ggml_log_level /*level*/, const char * text, void * /*user_data*/) {
+    // the log level and the user_data pointer are intentionally unused
+    LOG_TEE("%s", text);
+}
+
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
    llama_chat_msg new_msg{role, content};
    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
    chat_msgs.push_back({role, content});
-    LOG_DBG("formatted: '%s'\n", formatted.c_str());
+    LOG("formatted: %s\n", formatted.c_str());
    return formatted;
 }

 int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    auto & sparams = params.sparams;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("main", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
+    // TODO: Dump params ?
+    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+
    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
-        LOG_ERR("************\n");
-        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
-        LOG_ERR("************\n");
-        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        LOG_ERR("************\n\n");
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
-        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }

    if (params.rope_freq_base != 0.0) {
-        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
-        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_INF("%s: llama backend init\n", __func__);
+    print_build_info();

+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
+    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
-    gpt_sampler * smpl = nullptr;
+    llama_sampling * smpl = nullptr;

    std::vector<llama_chat_msg> chat_msgs;

@@ -196,19 +202,21 @@ int main(int argc, char ** argv) {
    g_smpl = &smpl;

    // load the model and apply lora adapter, if any
-    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    model = llama_init.model;
    ctx = llama_init.context;

    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n", __func__);
+        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

-    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
-
+    LOG("%s: llama threadpool init = n_threads = %d\n",
+        __func__,
+        (int) params.cpuparams.n_threads
+    );
    struct ggml_threadpool_params tpp_batch =
            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
    struct ggml_threadpool_params tpp =
@@ -220,8 +228,8 @@ int main(int argc, char ** argv) {
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
        threadpool_batch = ggml_threadpool_new(&tpp_batch);
        if (!threadpool_batch) {
-            LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
-            return 1;
+            LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            exit(1);
        }

        // Start the non-batch threadpool in the paused state
@@ -230,54 +238,55 @@ int main(int argc, char ** argv) {

    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
    if (!threadpool) {
-        LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-        return 1;
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
    }

    llama_attach_threadpool(ctx, threadpool, threadpool_batch);

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
+    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
+        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
    }

    // print chat template example in conversation mode
    if (params.conversation) {
        if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+            LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
        } else {
-            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+            LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
-        LOG_INF("\n");
+        LOG_TEE("\n");
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }

    std::string path_session = params.path_prompt_cache;
    std::vector<llama_token> session_tokens;

    if (!path_session.empty()) {
-        LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
        if (!file_exists(path_session)) {
-            LOG_INF("%s: session file does not exist, will create.\n", __func__);
+            LOG_TEE("%s: session file does not exist, will create.\n", __func__);
        } else if (file_is_empty(path_session)) {
-            LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
+            LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
        } else {
            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
-                LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
+                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
            session_tokens.resize(n_token_count_out);
-            LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }

@@ -285,8 +294,7 @@ int main(int argc, char ** argv) {
    if (!llama_model_has_encoder(model)) {
        GGML_ASSERT(!llama_add_eos_token(model));
    }
-
-    LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
+    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;

@@ -295,31 +303,31 @@ int main(int argc, char ** argv) {
            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-            LOG_DBG("tokenize the prompt\n");
+            LOG("tokenize the prompt\n");
            embd_inp = ::llama_tokenize(ctx, prompt, true, true);
        } else {
-            LOG_DBG("use session tokens\n");
+            LOG("use session tokens\n");
            embd_inp = session_tokens;
        }

-        LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
-        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
+        LOG("prompt: \"%s\"\n", log_tostr(prompt));
+        LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    // Should not run without any tokens
    if (embd_inp.empty()) {
        if (add_bos) {
            embd_inp.push_back(llama_token_bos(model));
-            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
+            LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
        } else {
-            LOG_ERR("input is empty\n");
+            LOG_TEE("error: input is empty\n");
            return -1;
        }
    }

    // Tokenize negative prompt
    if ((int) embd_inp.size() > n_ctx - 4) {
-        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

@@ -333,28 +341,29 @@ int main(int argc, char ** argv) {
            n_matching_session_tokens++;
        }
        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
-            LOG_INF("%s: using full prompt from session file\n", __func__);
+            LOG_TEE("%s: using full prompt from session file\n", __func__);
        } else if (n_matching_session_tokens >= embd_inp.size()) {
-            LOG_INF("%s: session file has exact match for prompt!\n", __func__);
+            LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                    __func__, n_matching_session_tokens, embd_inp.size());
+            LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
        } else {
-            LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
-                    __func__, n_matching_session_tokens, embd_inp.size());
+            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
        }

        // remove any "future" tokens that we might have inherited from the previous session
        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
    }

-    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
-         embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
+    LOGLN(
+            "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
+            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

    // if we will use the cache for the full prompt without reaching the end of the cache, force
    // reevaluation of the last token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
-        LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
+        LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);

        session_tokens.resize(embd_inp.size() - 1);
    }
@@ -376,20 +385,21 @@ int main(int argc, char ** argv) {
    }

    if (params.verbose_prompt) {
-        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_TEE("\n");
+        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > add_bos) {
-            LOG_INF("%s: static prompt based on n_keep: '", __func__);
+            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
-            LOG_CNT("'\n");
+            LOG_TEE("'\n");
        }
-        LOG_INF("\n");
+        LOG_TEE("\n");
    }

    // ctrl+C handling
@@ -409,56 +419,47 @@ int main(int argc, char ** argv) {
    }

    if (params.interactive) {
-        LOG_INF("%s: interactive mode on.\n", __func__);
+        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (!params.antiprompt.empty()) {
            for (const auto & antiprompt : params.antiprompt) {
-                LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
+                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
                if (params.verbose_prompt) {
                    auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
                    for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                    }
                }
            }
        }

        if (params.input_prefix_bos) {
-            LOG_INF("Input prefix with BOS\n");
+            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
-            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }

        if (!params.input_suffix.empty()) {
-            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }
    }
-
-    smpl = gpt_sampler_init(model, sparams);
-    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        return 1;
-    }
-
-    LOG_INF("sampler seed: %u\n",     gpt_sampler_get_seed(smpl));
-    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-    LOG_INF("sampler chain: %s\n",    gpt_sampler_print(smpl).c_str());
-
-    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("sampling params: \n%s\n", sparams.print_all().c_str());
+    LOG_TEE("sampling order:  \n%s\n", sparams.print_samplers().c_str());
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

    // group-attention state
    // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -472,9 +473,9 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(ga_w % ga_n == 0            && "grp_attn_w must be a multiple of grp_attn_n");     // NOLINT
      //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of grp_attn_w");    // NOLINT
      //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
    }
-    LOG_INF("\n");
+    LOG_TEE("\n\n");

    if (params.interactive) {
        const char * control_message;
@@ -486,11 +487,11 @@ int main(int argc, char ** argv) {
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
-        LOG_INF("== Running in interactive mode. ==\n");
+        LOG_TEE("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
+        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_INF(       "%s\n", control_message);
+        LOG_TEE(       "%s\n", control_message);

        is_interacting = params.interactive_first;
    }
@@ -524,12 +525,18 @@ int main(int argc, char ** argv) {
        antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
    }

+    smpl = llama_sampling_init(model, sparams);
+    if (!smpl) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
    if (llama_model_has_encoder(model)) {
        int enc_input_size = embd_inp.size();
        llama_token * enc_input_buf = embd_inp.data();

        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
-            LOG_ERR("%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return 1;
        }

@@ -555,8 +562,9 @@ int main(int argc, char ** argv) {
                embd.resize(max_embd_size);

                console::set_display(console::error);
-                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
+                fflush(stdout);
            }

            if (ga_n == 1) {
@@ -564,35 +572,29 @@ int main(int argc, char ** argv) {
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-
                if (n_past + (int) embd.size() >= n_ctx) {
-                    if (!params.ctx_shift){
-                        LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+                    if (params.n_predict == -2) {
+                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
-                    } else {
-                        if (params.n_predict == -2) {
-                            LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                            break;
-                        }
-
-                        const int n_left    = n_past - params.n_keep;
-                        const int n_discard = n_left/2;
-
-                        LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
-                                n_past, n_left, n_ctx, params.n_keep, n_discard);
-
-                        llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                        llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
-
-                        n_past -= n_discard;
-
-                        LOG_DBG("after swap: n_past = %d\n", n_past);
-
-                        LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
-
-                        LOG_DBG("clear session path\n");
-                        path_session.clear();
                    }
+
+                    const int n_left    = n_past - params.n_keep;
+                    const int n_discard = n_left/2;
+
+                    LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                            n_past, n_left, n_ctx, params.n_keep, n_discard);
+
+                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+
+                    n_past -= n_discard;
+
+                    LOG("after swap: n_past = %d\n", n_past);
+
+                    LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+
+                    LOG("clear session path\n");
+                    path_session.clear();
                }
            } else {
                // context extension via Self-Extend
@@ -601,10 +603,10 @@ int main(int argc, char ** argv) {
                    const int bd = (ga_w/ga_n)*(ga_n - 1);
                    const int dd = (ga_w/ga_n) - ib*bd - ga_w;

-                    LOG_DBG("\n");
-                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
-                    LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
-                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+                    LOG("\n");
+                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+                    LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+                    LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

                    llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
                    llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
@@ -614,7 +616,7 @@ int main(int argc, char ** argv) {

                    ga_i += ga_w/ga_n;

-                    LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+                    LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
                }
            }

@@ -646,19 +648,19 @@ int main(int argc, char ** argv) {
                    n_eval = params.n_batch;
                }

-                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
+                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-                    LOG_ERR("%s : failed to eval\n", __func__);
+                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

-                LOG_DBG("n_past = %d\n", n_past);
+                LOG("n_past = %d\n", n_past);
                // Display total tokens alongside total time
                if (params.n_print > 0 && n_past % params.n_print == 0) {
-                    LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+                    LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
                }
            }

@@ -676,14 +678,14 @@ int main(int argc, char ** argv) {
                need_to_save_session = false;
                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

-                LOG_DBG("saved session to %s\n", path_session.c_str());
+                LOG("saved session to %s\n", path_session.c_str());
            }

-            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+            const llama_token id = llama_sampling_sample(smpl, ctx, -1);

-            gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
+            llama_sampling_accept(smpl, id, /* apply_grammar= */ true);

-            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
+            // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());

            embd.push_back(id);

@@ -693,16 +695,16 @@ int main(int argc, char ** argv) {
            // decrement remaining sampling budget
            --n_remain;

-            LOG_DBG("n_remain: %d\n", n_remain);
+            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
-            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
-                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
+                llama_sampling_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
@@ -717,7 +719,7 @@ int main(int argc, char ** argv) {
                const std::string token_str = llama_token_to_piece(ctx, id, params.special);

                // Console/Stream Output
-                LOG("%s", token_str.c_str());
+                fprintf(stdout, "%s", token_str.c_str());

                // Record Displayed Tokens To Log
                // Note: Generated tokens are created one by one hence this check
@@ -729,6 +731,8 @@ int main(int argc, char ** argv) {
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
+
+                fflush(stdout);
            }
        }

@@ -743,7 +747,7 @@ int main(int argc, char ** argv) {
            // check for reverse prompt in the last n_prev tokens
            if (!params.antiprompt.empty()) {
                const int n_prev = 32;
-                const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
+                const std::string last_output = llama_sampling_prev_str(smpl, ctx, n_prev);

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
@@ -765,7 +769,7 @@ int main(int argc, char ** argv) {
                }

                // check for reverse prompt using special tokens
-                llama_token last_token = gpt_sampler_last(smpl);
+                llama_token last_token = llama_sampling_last(smpl);
                for (std::vector<llama_token> ids : antiprompt_ids) {
                    if (ids.size() == 1 && last_token == ids[0]) {
                        if (params.interactive) {
@@ -777,13 +781,13 @@ int main(int argc, char ** argv) {
                }

                if (is_antiprompt) {
-                    LOG_DBG("found antiprompt: %s\n", last_output.c_str());
+                    LOG("found antiprompt: %s\n", last_output.c_str());
                }
            }

            // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
-                LOG_DBG("found an EOG token\n");
+            if (llama_token_is_eog(model, llama_sampling_last(smpl))) {
+                LOG("found an EOG token\n");

                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
@@ -797,32 +801,32 @@ int main(int argc, char ** argv) {
                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                    }
                    is_interacting = true;
-                    LOG("\n");
+                    printf("\n");
                }
            }

            // if current token is not EOG, we add it to current assistant message
            if (params.conversation) {
-                const auto id = gpt_sampler_last(smpl);
+                auto id = llama_sampling_last(smpl);
                assistant_ss << llama_token_to_piece(ctx, id, false);
            }

            if (n_past > 0 && is_interacting) {
-                LOG_DBG("waiting for user input\n");
+                LOG("waiting for user input\n");

                if (params.conversation) {
-                    LOG("\n> ");
+                    printf("\n> ");
                }

                if (params.input_prefix_bos) {
-                    LOG_DBG("adding input prefix BOS token\n");
+                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty() && !params.conversation) {
-                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
-                    LOG("%s", params.input_prefix.c_str());
+                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                    printf("%s", params.input_prefix.c_str());
                }

                // color user input only
@@ -845,11 +849,11 @@ int main(int argc, char ** argv) {
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty() && !params.conversation) {
-                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
-                        LOG("%s", params.input_suffix.c_str());
+                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                        printf("%s", params.input_suffix.c_str());
                    }

-                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
+                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

@@ -866,7 +870,7 @@ int main(int argc, char ** argv) {
                    const auto line_inp = ::llama_tokenize(ctx, user_inp,            false, format_chat);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

-                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
+                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    // if user stop generation mid-way, we must add EOT to finish model's last response
                    if (need_insert_eot && format_chat) {
@@ -889,9 +893,9 @@ int main(int argc, char ** argv) {
                    assistant_ss.str("");

                    n_remain -= line_inp.size();
-                    LOG_DBG("n_remain: %d\n", n_remain);
+                    LOG("n_remain: %d\n", n_remain);
                } else {
-                    LOG_DBG("empty line, passing control back\n");
+                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
@@ -899,7 +903,7 @@ int main(int argc, char ** argv) {

            if (n_past > 0) {
                if (is_interacting) {
-                    gpt_sampler_reset(smpl);
+                    llama_sampling_reset(smpl);
                }
                is_interacting = false;
            }
@@ -907,7 +911,7 @@ int main(int argc, char ** argv) {

        // end of generation
        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
-            LOG(" [end of text]\n");
+            LOG_TEE(" [end of text]\n");
            break;
        }

@@ -920,23 +924,25 @@ int main(int argc, char ** argv) {
    }

    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
-        LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

-    LOG("\n\n");
-    gpt_perf_print(ctx, smpl);
+    llama_print_timings(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

-    gpt_sampler_free(smpl);
-
    llama_free(ctx);
    llama_free_model(model);

+    llama_sampling_free(smpl);
    llama_backend_free();

    ggml_threadpool_free(threadpool);
    ggml_threadpool_free(threadpool_batch);

+#ifndef LOG_DISABLE_LOGS
+    LOG_TEE("Log end\n");
+#endif // LOG_DISABLE_LOGS
+
    return 0;
 }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -1,10 +1,7 @@
 // A basic application simulating a server with multiple clients.
 // The clients submit requests to the server and they are processed in parallel.

-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
-#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -54,7 +51,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
    ~client() {
        if (smpl) {
-            gpt_sampler_free(smpl);
+            llama_sampling_free(smpl);
        }
    }

@@ -75,7 +72,7 @@ struct client {
    std::string prompt;
    std::string response;

-    struct gpt_sampler * smpl = nullptr;
+    struct llama_sampling * smpl = nullptr;
 };

 static void print_date_time() {
@@ -84,9 +81,7 @@ static void print_date_time() {
    char buffer[80];
    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);

-    LOG_INF("\n");
-    LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
-    LOG_INF("\n");
+    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
 }

 // Define a split string function to ...
@@ -105,12 +100,11 @@ int main(int argc, char ** argv) {

    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    // number of simultaneous "clients" to simulate
    const int32_t n_clients = params.n_parallel;

@@ -125,6 +119,12 @@ int main(int argc, char ** argv) {

    const bool dump_kv_cache = params.dump_kv_cache;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("parallel", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -137,22 +137,23 @@ int main(int argc, char ** argv) {

    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
-        LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
    } else {
        // Output each line of the input params.prompts vector and copy to k_prompts
        int index = 0;
-        LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());

        std::vector<std::string> prompts = split_string(params.prompt, '\n');
        for (const auto& prompt : prompts) {
            k_prompts.resize(index + 1);
            k_prompts[index] = prompt;
            index++;
-            LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
+            printf("%3d prompt: %s\n", index, prompt.c_str());
        }
    }

-    LOG_INF("\n\n");
+    fprintf(stderr, "\n\n");
+    fflush(stderr);

    const int n_ctx = llama_n_ctx(ctx);

@@ -160,7 +161,7 @@ int main(int argc, char ** argv) {
    for (size_t i = 0; i < clients.size(); ++i) {
        auto & client = clients[i];
        client.id = i;
-        client.smpl = gpt_sampler_init(model, params.sparams);
+        client.smpl = llama_sampling_init(model, params.sparams);
    }

    std::vector<llama_token> tokens_system;
@@ -181,19 +182,19 @@ int main(int argc, char ** argv) {

    const auto t_main_start = ggml_time_us();

-    LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
-    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
-    LOG_INF("\n");
+    LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
+    LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_TEE("\n");

    {
-        LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
+        LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);

        for (int32_t i = 0; i < n_tokens_system; ++i) {
            llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

@@ -202,10 +203,10 @@ int main(int argc, char ** argv) {
            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
        }

-        LOG_INF("\n");
+        LOG_TEE("\n");
    }

-    LOG_INF("Processing requests ...\n\n");
+    LOG_TEE("Processing requests ...\n\n");

    while (true) {
        if (dump_kv_cache) {
@@ -236,7 +237,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
            }

-            LOG_INF("%s: clearing the KV cache\n", __func__);
+            LOG_TEE("%s: clearing the KV cache\n", __func__);
        }

        // insert new sequences for decoding
@@ -252,7 +253,7 @@ int main(int argc, char ** argv) {
                    client.prompt   = client.input + "\nAssistant:";
                    client.response = "";

-                    gpt_sampler_reset(client.smpl);
+                    llama_sampling_reset(client.smpl);

                    // do not prepend BOS because we have a system prompt!
                    std::vector<llama_token> tokens_prompt;
@@ -271,7 +272,7 @@ int main(int argc, char ** argv) {
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;

-                    LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

                    g_seq_id += 1;

@@ -315,11 +316,11 @@ int main(int argc, char ** argv) {
            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return 1;
                }

-                LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+                LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);

                n_cache_miss += 1;

@@ -330,7 +331,7 @@ int main(int argc, char ** argv) {
                continue;
            }

-            LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+            LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);

            for (auto & client : clients) {
                if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@@ -340,9 +341,9 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-                const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(client.smpl, ctx, client.i_batch - i);

-                gpt_sampler_accept(client.smpl, id, true);
+                llama_sampling_accept(client.smpl, id, true);

                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@@ -375,7 +376,7 @@ int main(int argc, char ** argv) {

                    const auto t_main_end = ggml_time_us();

-                    LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
+                    LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
                            client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                            (t_main_end - client.t_start_prompt) / 1e6,
                            (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@@ -398,22 +399,22 @@ int main(int argc, char ** argv) {

    print_date_time();

-    LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    if (params.prompt_file.empty()) {
        params.prompt_file = "used built-in defaults";
    }
-    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());

-    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
-    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
-    LOG_INF("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
-    LOG_INF("Cache misses:        %6d\n", n_cache_miss);
+    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+    LOG_TEE("Cache misses:        %6d\n", n_cache_miss);

-    LOG_INF("\n");
+    LOG_TEE("\n");

    // TODO: print sampling/grammar timings for all clients
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, nullptr);

    llama_batch_free(batch);

@@ -422,7 +423,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -1,6 +1,4 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -8,10 +6,12 @@
 #include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
-    LOG("\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -21,12 +21,11 @@ int main(int argc, char ** argv) {
    params.n_keep = 32;
    params.i_pos  = -1;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    int n_junk = params.n_junk;
    int n_keep = params.n_keep;
    int n_grp  = params.grp_attn_n;
@@ -66,7 +65,7 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n" , __func__);
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

@@ -80,15 +79,11 @@ int main(int argc, char ** argv) {

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
-        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

-    auto sparams = llama_sampler_chain_default_params();
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());

    // tokenize the prompt
    std::vector<llama_token> tokens_list;
@@ -110,14 +105,14 @@ int main(int argc, char ** argv) {
    const int n_batch     = ctx_params.n_batch;
    const int n_batch_grp = ctx_params.n_batch/n_grp;

-    LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);

    // print the prompt token-by-token

-    LOG_INF("\n");
-    LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
-    LOG_INF("prompt tokens: %d\n", n_tokens_all);
-    //LOG_INF("prompt: %s\n", params.prompt.c_str());
+    LOG_TEE("\n");
+    LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
+    LOG_TEE("prompt tokens: %d\n", n_tokens_all);
+    //LOG_TEE("prompt: %s\n", params.prompt.c_str());

    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);

@@ -148,11 +143,11 @@ int main(int argc, char ** argv) {
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_INF("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

-        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));

        if (i + n_batch >= n_tokens_all) {
            break;
@@ -162,7 +157,7 @@ int main(int argc, char ** argv) {
    for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
        const int n_discard = n_batch;

-        LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
+        LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);

        llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -182,18 +177,18 @@ int main(int argc, char ** argv) {
        }

        if (llama_decode(ctx, batch) != 0) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }

-        LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+        LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
    }

    {
        const int n_discard = n_past - n_ctx + n_predict;

        if (n_discard > 0) {
-            LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+            LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

            llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
            llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
@@ -204,32 +199,39 @@ int main(int argc, char ** argv) {
        }
    }

-    LOG_INF("\n");
-    LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
-    LOG_INF("\n");
+    LOG_TEE("\n");
+    LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+    LOG_TEE("\n");

    // main loop

    int n_cur    = n_tokens_all;
    int n_decode = 0;

-    LOG_INF("%s", prompt_suffix.c_str());
+    LOG_TEE("%s", prompt_suffix.c_str());
+    fflush(stdout);

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len) {
        // sample the next token
        {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
+            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+
+            llama_sampling_set_logits(smpl, logits);
+
+            // sample the most likely token
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-                LOG("\n");
+                LOG_TEE("\n");

                break;
            }

-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            fflush(stdout);

            n_decode += 1;

@@ -244,27 +246,25 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

-    LOG("\n");
+    LOG_TEE("\n");

    const auto t_main_end = ggml_time_us();

-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG("\n");
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, nullptr);

-    LOG("\n");
-
-    llama_sampler_free(smpl);
+    fprintf(stderr, "\n");

    llama_batch_free(batch);

+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,21 +1,18 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

-#include <algorithm>
-#include <array>
-#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <fstream>
-#include <mutex>
-#include <random>
 #include <sstream>
 #include <thread>
+#include <mutex>
+#include <atomic>
 #include <vector>
+#include <array>
+#include <fstream>
+#include <sstream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -43,7 +40,7 @@ static void write_logfile(
    }

    if (params.hellaswag) {
-        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+        fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
        return;
    }

@@ -51,7 +48,7 @@ static void write_logfile(

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
-        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
@@ -60,7 +57,7 @@ static void write_logfile(
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

@@ -79,7 +76,7 @@ static void write_logfile(
    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
    yaml_dump_vector_float(logfile, "probs", results.probs);

-    llama_perf_dump_yaml(logfile, ctx);
+    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }

@@ -346,16 +343,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

-    LOG_INF("%s: tokenizing the input ..\n", __func__);
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

    if (int(tokens.size()) < 2*n_ctx) {
-        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                n_ctx);
-        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

@@ -366,16 +363,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    prob_history.resize(tokens.size());

    if (params.ppl_stride <= 0) {
-        LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }

    const int calc_chunk = n_ctx;

-    LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

    if (int(tokens.size()) <= calc_chunk) {
-        LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
                tokens.size(), n_ctx, params.ppl_stride);
        return {tokens, -1, logit_history, prob_history};
    }
@@ -389,14 +386,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    int count = 0;
    double nll = 0.0;

-    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.ppl_stride;
        const int end   = start + calc_chunk;

        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-        //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);

        std::vector<float> logits;

@@ -409,10 +406,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

-            //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                //LOG_ERR("%s : failed to eval\n", __func__);
+                //fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }

@@ -436,16 +433,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {

            // Calculate probability of next token, given the previous ones.
@@ -462,12 +459,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        }
        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
-            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
        } else {
-            LOG("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
        }
+        fflush(stdout);
    }
-    LOG("\n");
+    printf("\n");

    return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
@@ -489,26 +487,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    if (!params.logits_file.empty()) {
        logits_stream.open(params.logits_file.c_str(), std::ios::binary);
        if (!logits_stream.is_open()) {
-            LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
            return {};
        }
-        LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
        logits_stream.write("_logits_", 8);
        logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
    }

    auto tim1 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenizing the input ..\n", __func__);
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
-    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (int(tokens.size()) < 2*n_ctx) {
-        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+        fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
                n_ctx);
-        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }

@@ -541,7 +539,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        logits.reserve((size_t)n_ctx * n_vocab);
    }

-    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -614,7 +612,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            }

            if (llama_decode(ctx, batch)) {
-                LOG_INF("%s : failed to eval\n", __func__);
+                fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }

@@ -629,13 +627,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total*n_chunk/n_seq);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        for (int seq = 0; seq < n_seq_batch; seq++) {
@@ -657,18 +655,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

            // perplexity is e^(average negative log-likelihood)
            if (params.ppl_output_type == 0) {
-                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+                printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            } else {
                double av = nll/count;
                double av2 = nll2/count - av*av;
                if (av2 > 0) av2 = sqrt(av2/(count-1));
-                LOG("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+                printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
            }
        }
+        fflush(stdout);

        logits.clear();
    }
-    LOG("\n");
+    printf("\n");

    nll2 /= count;
    nll /= count;
@@ -676,9 +675,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    nll2 -= nll * nll;
    if (nll2 > 0) {
        nll2 = sqrt(nll2/(count-1));
-        LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
    } else {
-        LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
+        printf("Unexpected negative standard deviation of log(prob)\n");
    }

    llama_batch_free(batch);
@@ -704,7 +703,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<

        const int ret = llama_decode(ctx, batch_view);
        if (ret != 0) {
-            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
            return false;
        }

@@ -790,15 +789,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    }

    if (prompt_lines.size() % 6 != 0) {
-        LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
        return;
    }

    size_t hs_task_count = prompt_lines.size()/6;
-    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    LOG_INF("================================= is_spm = %d\n", is_spm);
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);

    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
@@ -825,7 +824,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::vector<llama_token> seq_tokens[4];
    };

-    LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );

    // Select and read data from prompt lines
    std::vector<hs_data_t> hs_data(hs_task_count);
@@ -871,9 +870,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
    }

-    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
+    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);

-    LOG("\ntask\tacc_norm\n");
+    printf("\ntask\tacc_norm\n");

    double acc = 0.0f;

@@ -941,7 +940,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        if (i0 == i1) {
-            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@@ -949,7 +948,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

@@ -999,7 +998,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
                }
            }

-            //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+            //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);

            // If the gold ending got the maximum logprobe add one accuracy point
            if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -1007,7 +1006,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            }

            // Print the accumulated accuracy mean x 100
-            LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+            printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+            fflush(stdout);
        }

        i0 = i1 - 1;
@@ -1015,7 +1015,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

    llama_batch_free(batch);

-    LOG("\n");
+    printf("\n");
 }

 struct winogrande_entry {
@@ -1059,7 +1059,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
            }
        }
        if (ipos != 4) {
-            LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+            printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
            continue;
        }
        auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -1073,13 +1073,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
            if (sentence[where] == '_') break;
        }
        if (where == int(sentence.size())) {
-            LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
+            printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
            continue;
        }
        std::istringstream stream(answer.c_str());
        int i_answer; stream >> i_answer;
        if (stream.fail() || i_answer < 1 || i_answer > 2) {
-            LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+            printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
            continue;
        }
        result.emplace_back();
@@ -1108,14 +1108,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    auto data = load_winogrande_from_csv(params.prompt);
    if (data.empty()) {
-        LOG_ERR("%s: no tasks\n", __func__);
+        fprintf(stderr, "%s: no tasks\n", __func__);
        return;
    }

-    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());

    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-        LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+        fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
        std::mt19937 rng(1);
        std::vector<int> aux(data.size());
        for (int i = 0; i < int(data.size()); ++i) {
@@ -1133,7 +1133,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        data = std::move(selected);
    }

-    LOG_INF("%s : tokenizing selected tasks\n", __func__);
+    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);

    for (auto & task : data) {
        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@@ -1156,7 +1156,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
    }

-    LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
+    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
@@ -1217,7 +1217,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        }

        if (i0 == i1) {
-            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@@ -1225,7 +1225,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

@@ -1285,20 +1285,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            ++n_done;

            // print the accumulated accuracy mean x 100
-            LOG("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+            printf("%zu\t%.4lf\t%10.6f  %10.6f  %d  %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+            fflush(stdout);
        }

        i0 = i1 - 1;
    }

-    LOG("\n");
+    printf("\n");

    if (n_done < 100) return;

    const float p = 1.f*n_correct/n_done;
    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-
-    LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }

 static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1347,7 +1347,7 @@ struct multiple_choice_task {
 static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
    if (task.question.empty() || task.mc1.answers.empty()) {
        if (log_error) {
-            LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
+            printf("%s: found bad task with empty question and/or answers\n", __func__);
        }
        return false;
    }
@@ -1355,7 +1355,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
    for (auto& answer : task.mc1.answers) {
        if (answer.empty()) {
            if (log_error) {
-                LOG_ERR("%s: found empty answer\n", __func__);
+                printf("%s: found empty answer\n", __func__);
            }
            return false;
        }
@@ -1409,14 +1409,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    uint32_t n_task;
    strstream.read((char *)&n_task, sizeof(n_task));
    if (strstream.fail() || n_task == 0) {
-        LOG_ERR("%s: no tasks\n", __func__);
+        printf("%s: no tasks\n", __func__);
        return;
    }
-    LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
+    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
-        LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
+        printf("%s: failed to read task positions from prompt\n", __func__);
        return;
    }

@@ -1424,21 +1424,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
        // Use all tasks
        tasks.resize(n_task);
-        LOG_INF("%s: reading tasks", __func__);
+        printf("%s: reading tasks", __func__);
        int n_dot = std::max((int) n_task/100, 1);
        int i = 0;
        for (auto& task : tasks) {
            ++i;
            if (!task.deserialize(strstream)) {
-                LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
+                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
                return;
            }
-            if (i%n_dot == 0) LOG(".");
+            if (i%n_dot == 0) printf(".");
        }
-        LOG("done\n");
+        printf("done\n");
    }
    else {
-        LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
        std::mt19937 rng(1);
        std::vector<int> aux(n_task);
        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1451,16 +1451,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            aux.pop_back();
            strstream.seekg(task_pos[idx], std::ios::beg);
            if (!task.deserialize(strstream)) {
-                LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                return;
            }
        }
        n_task = params.multiple_choice_tasks;
    }

-    LOG_INF("%s: preparing task data", __func__);
+    printf("%s: preparing task data", __func__);
+    fflush(stdout);
    if (n_task > 500) {
-        LOG("...");
+        printf("...");
+        fflush(stdout);
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@@ -1484,10 +1486,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        for (auto& w : workers) w = std::thread(prepare);
        prepare();
        for (auto& w : workers) w.join();
-        LOG("done\n");
+        printf("done\n");
+        fflush(stdout);
        int nbad = n_bad;
        if (nbad > 0) {
-            LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
+            printf("%s: found %d malformed tasks\n", __func__, nbad);
            return;
        }
    } else {
@@ -1499,15 +1502,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                return;
            }
            if (i_task%n_dot == 0) {
-                LOG(".");
+                printf(".");
+                fflush(stdout);
            }
        }
-        LOG("done\n");
+        printf("done\n");
    }

-    LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());

-    LOG("\ntask\tacc_norm\n");
+    printf("\ntask\tacc_norm\n");

    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx   = llama_n_ctx(ctx);
@@ -1586,7 +1590,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        }

        if (i0 == i1) {
-            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
+            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }

@@ -1594,7 +1598,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            LOG_ERR("%s: llama_decode() failed\n", __func__);
+            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
            return;
        }

@@ -1618,13 +1622,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        // compute the logprobs for each ending of the decoded tasks
        for (size_t i = i0; i < i1; ++i) {
            auto & cur_task = tasks[i];
-            //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
            //    if (cur_task.mc1.labels[j] == 1) {
-            //        LOG("%d", j+1);
+            //        printf("%d", j+1);
            //    }
            //}
-            //LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);
+            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);

            // get the logits of the last token of the common prefix
            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@@ -1636,13 +1640,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                size_t count = 1;
                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    //LOG("        %zu  %g\n", ir, eval_results[ir]);
+                    //printf("        %zu  %g\n", ir, eval_results[ir]);
                    ++count;
                    log_prob += eval_results[ir++];
                }
                cur_task.log_probs[s] = log_prob / count;
-                //LOG("        Final: %g\n", log_prob / count);
-                //LOG("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+                //printf("        Final: %g\n", log_prob / count);
+                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
            }

            // Find the ending with maximum logprob
@@ -1662,7 +1666,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            ++n_done;

            // Print the accumulated accuracy mean x 100
-            LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+            fflush(stdout);
        }

        i0 = i1 - 1;
@@ -1674,30 +1679,29 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

    float p = 1.f*n_correct/n_done;
    float sigma = sqrt(p*(1-p)/(n_done-1));
-    LOG("\n");
-    LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
    p = 1.f*n_done/n_tot_answers;
    sigma = sqrt(p*(1-p)/(n_done-1));
-    LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);

-    LOG_INF("\n");
+    printf("\n");
 }

 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (params.logits_file.empty()) {
-        LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
        return;
    }
    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
    if (!in) {
-        LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
        return;
    }
    {
        char check[9]; check[8] = 0;
        in.read(check, 8);
        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-            LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
            return;
        }
    }
@@ -1705,7 +1709,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }

@@ -1713,16 +1717,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    in.read((char *)&n_vocab, sizeof(n_vocab));
    in.read((char *)&n_chunk, sizeof(n_chunk));
    if (in.fail()) {
-        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
    }

    std::vector<llama_token> tokens(n_ctx * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
    }

@@ -1771,7 +1775,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();

        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
+            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
            return;
        }

@@ -1792,7 +1796,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                LOG_ERR("%s : failed to eval\n", __func__);
+                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }

@@ -1809,16 +1813,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                LOG("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            LOG("%.2f minutes\n", total_seconds / 60.0);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+
+            printf("\nchunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
        }
-        LOG("\n");
-        LOG("chunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");

        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@@ -1827,77 +1831,79 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        p_diff_ptr += n_ctx - 1 - first;
        kld_ptr    += n_ctx - 1 - first;

-        LOG("%4d", i+1);
+        printf("%4d", i+1);

        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
        const double ppl_val = exp(log_ppl.first);
        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-        LOG("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+        printf("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);

        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-        LOG("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+        printf("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);

        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        LOG("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+        printf("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);

        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
        const double p_diff_rms_val = sqrt(p_diff_mse.first);
        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+        printf("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

        double p_top_val = 1.*kld.n_same_top/kld.count;
        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-        LOG("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+        printf("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);

-        LOG("\n");
+        printf("\n");
+
+        fflush(stdout);

        logits.clear();
    }
-    LOG("\n");
+    printf("\n");

    if (kld.count < 100) return; // we do not wish to do statistics on so few values

    std::sort(kld_values.begin(), kld_values.end());
    std::sort(p_diff_values.begin(), p_diff_values.end());

-    LOG("====== Perplexity statistics ======\n");
+    printf("====== Perplexity statistics ======\n");

    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
    const double ppl_val = exp(log_ppl.first);
    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-    LOG("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+    printf("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);

    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
    const double ppl_base_val = exp(log_ppl_base.first);
    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
-    LOG("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+    printf("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);

    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-    // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
-    LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);

    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-    LOG("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+    printf("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);

    const double ppl_ratio_val = exp(log_ppl_ratio_val);
    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
-    LOG("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+    printf("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);

    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
    const double ppl_diff_val = ppl_val - ppl_base_val;
    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
-    LOG("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+    printf("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);

-    LOG("\n");
+    printf("\n");

-    LOG("====== KL divergence statistics ======\n");
+    printf("====== KL divergence statistics ======\n");
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    LOG("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+    printf("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                               : kld_values[kld_values.size()/2];

@@ -1909,49 +1915,50 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
    };

-    LOG("Maximum KLD: %10.6f\n", kld_values.back());
-    LOG("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
-    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    LOG("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    LOG("Median  KLD: %10.6f\n", kld_median);
-    LOG("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
-    LOG(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
-    LOG(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
-    LOG("Minimum KLD: %10.6f\n", kld_values.front());
+    printf("Maximum KLD: %10.6f\n", kld_values.back());
+    printf("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    printf("Median  KLD: %10.6f\n", kld_median);
+    printf("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    printf(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    printf(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    printf("Minimum KLD: %10.6f\n", kld_values.front());

-    LOG("\n");
+    printf("\n");

-    LOG("====== Token probability statistics ======\n");
+    printf("====== Token probability statistics ======\n");

    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-    LOG("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
+    printf("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);

    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
                                               : p_diff_values[p_diff_values.size()/2];

-    LOG("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
-    LOG("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
-    LOG("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
-    LOG("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
-    LOG("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
-    LOG("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
-    LOG("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
-    LOG("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
-    LOG("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
-    LOG(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
-    LOG(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
-    LOG(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
-    LOG("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
+    printf("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
+    printf("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    printf("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    printf("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    printf("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    printf("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    printf("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
+    printf("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    printf("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    printf(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    printf(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    printf(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    printf("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());

    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-    // LOG("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+    // printf("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);

    const double p_diff_rms_val = sqrt(p_diff_mse.first);
    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-    LOG("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+    printf("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

    const double same_top_p = 1.0*kld.n_same_top/kld.count;
-    LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
+    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
+
 }

 int main(int argc, char ** argv) {
@@ -1959,18 +1966,16 @@ int main(int argc, char ** argv) {

    params.n_ctx = 512;
    params.logits_all = true;
-    params.escape = false;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    const int32_t n_ctx = params.n_ctx;

    if (n_ctx <= 0) {
-        LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }

@@ -1995,11 +2000,15 @@ int main(int argc, char ** argv) {
    }

    if (params.ppl_stride > 0) {
-        LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
                params.n_ctx, params.n_ctx + params.ppl_stride/2);
        params.n_ctx += params.ppl_stride/2;
    }

+    print_build_info();
+
+    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
+
    llama_backend_init();
    llama_numa_init(params.numa);

@@ -2009,21 +2018,21 @@ int main(int argc, char ** argv) {
    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);

    if (params.n_ctx > n_ctx_train) {
-        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    struct results_perplexity results;
@@ -2039,9 +2048,7 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
-
+    llama_print_timings(ctx, nullptr);
    write_logfile(ctx, params, model, results);

    llama_free(ctx);
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,8 +26,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_M",    LLAMA_FTYPE_MOSTLY_IQ2_M,    " 2.7  bpw quantization",            },
    { "IQ1_S",    LLAMA_FTYPE_MOSTLY_IQ1_S,    " 1.56 bpw quantization",            },
    { "IQ1_M",    LLAMA_FTYPE_MOSTLY_IQ1_M,    " 1.75 bpw quantization",            },
-    { "TQ1_0",    LLAMA_FTYPE_MOSTLY_TQ1_0,    " 1.69 bpw ternarization",           },
-    { "TQ2_0",    LLAMA_FTYPE_MOSTLY_TQ2_0,    " 2.06 bpw ternarization",           },
    { "Q2_K",     LLAMA_FTYPE_MOSTLY_Q2_K,     " 2.96G, +3.5199 ppl @ Llama-3-8B",  },
    { "Q2_K_S",   LLAMA_FTYPE_MOSTLY_Q2_K_S,   " 2.96G, +3.1836 ppl @ Llama-3-8B",  },
    { "IQ3_XXS",  LLAMA_FTYPE_MOSTLY_IQ3_XXS,  " 3.06 bpw quantization",            },
@@ -63,16 +61,6 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";

-static bool striequals(const char * a, const char * b) {
-    while (*a && *b) {
-        if (std::tolower(*a) != std::tolower(*b)) {
-            return false;
-        }
-        a++; b++;
-    }
-    return *a == *b;
-}
-
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;

@@ -80,7 +68,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
        ftype_str.push_back(std::toupper(ch));
    }
    for (auto & it : QUANT_OPTIONS) {
-        if (striequals(it.name.c_str(), ftype_str.c_str())) {
+        if (it.name == ftype_str) {
            ftype = it.ftype;
            ftype_str_out = it.name;
            return true;
@@ -235,15 +223,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
 }

 static ggml_type parse_ggml_type(const char * arg) {
-    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
-        auto type = (ggml_type)i;
+    ggml_type result = GGML_TYPE_COUNT;
+    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
+        auto type = ggml_type(j);
        const auto * name = ggml_type_name(type);
-        if (name && striequals(name, arg)) {
-            return type;
+        if (name && strcmp(arg, name) == 0) {
+            result = type; break;
        }
    }
-    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
-    return GGML_TYPE_COUNT;
+    return result;
 }

 int main(int argc, char ** argv) {
@@ -264,18 +252,12 @@ int main(int argc, char ** argv) {
        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
            if (arg_idx < argc-1) {
                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
-                if (params.output_tensor_type == GGML_TYPE_COUNT) {
-                    usage(argv[0]);
-                }
            } else {
                usage(argv[0]);
            }
        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
            if (arg_idx < argc-1) {
                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
-                if (params.token_embedding_type == GGML_TYPE_COUNT) {
-                    usage(argv[0]);
-                }
            } else {
                usage(argv[0]);
            }
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -1,16 +1,15 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

 #include <algorithm>
 #include <fstream>
-#include <iostream> // TODO: remove me

-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG("\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG_TEE("\n");
 }

 struct chunk {
@@ -19,7 +18,7 @@ struct chunk {
    // original file position
    size_t filepos;
    // original text data
-    std::string textdata;
+    std::string textdata = "";
    // tokenized text data
    std::vector<llama_token> tokens;
    // embedding
@@ -33,14 +32,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
    std::ifstream f(filename.c_str());

    if (!f.is_open()) {
-        LOG_ERR("could not open file %s\n", filename.c_str());
+        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
        return chunks;
    }

    chunk current_chunk;
    char buffer[1024];
    int64_t filepos = 0;
-    std::string current;
+    std::string current = "";
    while (f.read(buffer, 1024)) {
        current += std::string(buffer, f.gcount());
        size_t pos;
@@ -86,9 +85,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    llama_kv_cache_clear(ctx);

    // run model
-    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_decode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to decode\n", __func__);
+        fprintf(stderr, "%s : failed to decode\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
@@ -101,7 +100,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        if (embd == NULL) {
            embd = llama_get_embeddings_ith(ctx, i);
            if (embd == NULL) {
-                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
+                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
                continue;
            }
        }
@@ -114,28 +113,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
    gpt_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    // For BERT models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;
    params.embedding = true;

    if (params.chunk_size <= 0) {
-        LOG_ERR("chunk_size must be positive\n");
+        fprintf(stderr, "chunk_size must be positive\n");
        return 1;
    }
    if (params.context_files.empty()) {
-        LOG_ERR("context_files must be specified\n");
+        fprintf(stderr, "context_files must be specified\n");
        return 1;
    }

-    LOG_INF("processing files:\n");
+    print_build_info();
+
+    printf("processing files:\n");
    for (auto & context_file : params.context_files) {
-        LOG_INF("%s\n", context_file.c_str());
+        printf("%s\n", context_file.c_str());
    }

    std::vector<chunk> chunks;
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
    }
-    LOG_INF("Number of chunks: %ld\n", chunks.size());
+    printf("Number of chunks: %ld\n", chunks.size());

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
    llama_context * ctx = llama_init.context;

    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

@@ -164,19 +164,19 @@ int main(int argc, char ** argv) {

    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
        return 1;
    }

    if (n_ctx > n_ctx_train) {
-        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    // max batch size
@@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
    for (auto & chunk : chunks) {
        auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
        if (inp.size() > n_batch) {
-            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
            return 1;
        }
@@ -201,12 +201,12 @@ int main(int argc, char ** argv) {
    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) chunks.size(); i++) {
-            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
            for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
            }
-            LOG_INF("\n\n");
+            fprintf(stderr, "\n\n");
        }
    }

@@ -258,7 +258,7 @@ int main(int argc, char ** argv) {
    // start loop, receive query and return top k similar chunks based on cosine similarity
    std::string query;
    while (true) {
-        LOG("Enter query: ");
+        printf("Enter query: ");
        std::getline(std::cin, query);
        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);

@@ -282,21 +282,19 @@ int main(int argc, char ** argv) {
                return a.second > b.second;
            });

-            LOG("Top %d similar chunks:\n", params.sparams.top_k);
+            printf("Top %d similar chunks:\n", params.sparams.top_k);
            for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
-                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
-                LOG("similarity: %f\n", similarities[i].second);
-                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
-                LOG("--------------------\n");
+                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                printf("similarity: %f\n", similarities[i].second);
+                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                printf("--------------------\n");
            }
        }
    }

-    LOG("\n");
-    llama_perf_context_print(ctx);
-
    // clean up
+    llama_print_timings(ctx, nullptr);
    llama_batch_free(query_batch);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -10,21 +10,20 @@ This can be used for distributed LLM inference with `llama.cpp` in the following

 ```mermaid
 flowchart TD
-    rpcb<-->|TCP|srva
-    rpcb<-->|TCP|srvb
-    rpcb<-.->|TCP|srvn
+    rpcb---|TCP|srva
+    rpcb---|TCP|srvb
+    rpcb-.-|TCP|srvn
    subgraph hostn[Host N]
-    srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
+    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hostb[Host B]
-    srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
+    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hosta[Host A]
-    srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
+    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
    end
    subgraph host[Main Host]
-    local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
-    ggml[llama-cli]<-->rpcb[RPC backend]
+    ggml[llama.cpp]---rpcb[RPC backend]
    end
    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
 ```
@@ -63,12 +62,17 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.


-On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
-Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
+On the main host, build `llama.cpp` with only `-DGGML_RPC=ON`:
+
+```bash
+mkdir build-rpc
+cd build-rpc
+cmake .. -DGGML_RPC=ON
+cmake --build . --config Release
+```
+
+Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:

 ```bash
 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
-
-This way you can offload model layers to both local and remote devices.
-
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,4 +1,3 @@
-#include "arg.h"
 #include "common.h"
 #include "llama.h"

@@ -11,7 +10,8 @@ int main(int argc, char ** argv) {
    params.prompt = "The quick brown fox";
    params.sparams.seed = 1234;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

@@ -38,12 +38,10 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto sparams = llama_sampler_chain_default_params();
+    llama_sampling_params sparams = llama_sampling_default_params();
+    sparams.seed = params.sparams.seed;

-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampling * smpl = llama_sampling_init(model, sparams);

    // tokenize prompt
    auto tokens = llama_tokenize(ctx, params.prompt, true);
@@ -71,7 +69,11 @@ int main(int argc, char ** argv) {
    printf("\nfirst run: %s", params.prompt.c_str());

    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl, ctx, -1);
+        const auto * logits = llama_get_logits(ctx);
+
+        llama_sampling_set_logits(smpl, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl, nullptr);
        auto next_token_str = llama_token_to_piece(ctx, next_token);

        printf("%s", next_token_str.c_str());
@@ -94,10 +96,7 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampling * smpl2 = llama_sampling_init(model, sparams);

    printf("\nsecond run: %s", params.prompt.c_str());

@@ -127,7 +126,11 @@ int main(int argc, char ** argv) {

    // second run
    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
+        const auto * logits = llama_get_logits(ctx2);
+
+        llama_sampling_set_logits(smpl2, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl2, nullptr);
        auto next_token_str = llama_token_to_piece(ctx2, next_token);

        printf("%s", next_token_str.c_str());
@@ -154,10 +157,7 @@ int main(int argc, char ** argv) {
    // make new context
    auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));

-    llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampling * smpl3 = llama_sampling_init(model, sparams);

    printf("\nsingle seq run: %s", params.prompt.c_str());

@@ -215,7 +215,11 @@ int main(int argc, char ** argv) {

    // third run with seq 1 instead of 0
    for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
+        const auto * logits = llama_get_logits(ctx3);
+
+        llama_sampling_set_logits(smpl3, logits);
+
+        auto next_token = llama_sampling_sample_dist(smpl3, nullptr);
        auto next_token_str = llama_token_to_piece(ctx3, next_token);

        printf("%s", next_token_str.c_str());
@@ -232,9 +236,9 @@ int main(int argc, char ** argv) {

    printf("\n");

-    llama_sampler_free(smpl);
-    llama_sampler_free(smpl2);
-    llama_sampler_free(smpl3);
+    llama_sampling_free(smpl);
+    llama_sampling_free(smpl2);
+    llama_sampling_free(smpl3);

    llama_free(ctx3);
    llama_free_model(model);
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-server)
-
-option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+option(LLAMA_SERVER_SSL     "Build SSL support for the server"        OFF)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

@@ -30,7 +30,6 @@ set(PUBLIC_ASSETS
    system-prompts.js
    prompt-formats.js
    json-schema-to-grammar.mjs
-    loading.html
 )

 foreach(asset ${PUBLIC_ASSETS})
@@ -46,6 +45,9 @@ endforeach()

 add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)

 target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})

--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -7,7 +7,6 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 **Features:**
 * LLM inference of F16 and quantized models on GPU and CPU
 * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
- * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
 * Parallel decoding with multi-user support
 * Continuous batching
 * Multimodal (wip)
@@ -18,145 +17,262 @@ The project is under active development, and we are [looking for feedback and co

 ## Usage

-**Common params**
+```
+usage: ./llama-server [options]

-| Argument | Explanation |
-| -------- | ----------- |
-| `-h, --help, --usage` | print usage and exit |
-| `--version` | show version and build info |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
-| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
-| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
-| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
-| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
-| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
-| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
-| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
-| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
-| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
-| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
-| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
-| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
-| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
-| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
-| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
-| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with |
-| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
-| `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
-| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
-| `--no-escape` | do not process escape sequences |
-| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
-| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
-| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
-| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
-| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
-| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
-| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
-| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
-| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
-| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
-| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
-| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
-| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
-| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
-| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
-| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
-| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
-| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
-| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
-| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
-| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
-| `--check-tensors` | check model tensor data for invalid values (default: false) |
-| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
-| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
-| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
-| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
-| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
-| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
-| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
-| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
-| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
-| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
-| `--log-disable` | Log disable |
-| `--log-file FNAME` | Log to file |
-| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
-| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
-| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
-| `--log-prefix` | Enable prefx in log messages<br/>(env: LLAMA_LOG_PREFIX) |
-| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
+general:

+  -h,    --help, --usage          print usage and exit
+         --version                show version and build info
+  -v,    --verbose                print verbose information
+         --verbosity N            set specific verbosity level (default: 0)
+         --verbose-prompt         print a verbose prompt before generation (default: false)
+         --no-display-prompt      don't print prompt at generation (default: false)
+  -co,   --color                  colorise output to distinguish prompt and user input from generations (default: false)
+  -s,    --seed SEED              RNG seed (default: -1, use random seed for < 0)
+  -t,    --threads N              number of threads to use during generation (default: 8)
+  -tb,   --threads-batch N        number of threads to use during batch and prompt processing (default: same as --threads)
+  -td,   --threads-draft N        number of threads to use during generation (default: same as --threads)
+  -tbd,  --threads-batch-draft N  number of threads to use during batch and prompt processing (default: same as --threads-draft)
+         --draft N                number of tokens to draft for speculative decoding (default: 5)
+  -ps,   --p-split N              speculative decoding split probability (default: 0.1)
+  -lcs,  --lookup-cache-static FNAME
+                                  path to static lookup cache to use for lookup decoding (not updated by generation)
+  -lcd,  --lookup-cache-dynamic FNAME
+                                  path to dynamic lookup cache to use for lookup decoding (updated by generation)
+  -c,    --ctx-size N             size of the prompt context (default: 0, 0 = loaded from model)
+  -n,    --predict N              number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
+  -b,    --batch-size N           logical maximum batch size (default: 2048)
+  -ub,   --ubatch-size N          physical maximum batch size (default: 512)
+         --keep N                 number of tokens to keep from the initial prompt (default: 0, -1 = all)
+         --chunks N               max number of chunks to process (default: -1, -1 = all)
+  -fa,   --flash-attn             enable Flash Attention (default: disabled)
+  -p,    --prompt PROMPT          prompt to start generation with
+                                  in conversation mode, this will be used as system prompt
+                                  (default: '')
+  -f,    --file FNAME             a file containing the prompt (default: none)
+         --in-file FNAME          an input file (repeat to specify multiple files)
+  -bf,   --binary-file FNAME      binary file containing the prompt (default: none)
+  -e,    --escape                 process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
+         --no-escape              do not process escape sequences
+  -ptc,  --print-token-count N    print token count every N tokens (default: -1)
+         --prompt-cache FNAME     file to cache prompt state for faster startup (default: none)
+         --prompt-cache-all       if specified, saves user input and generations to cache as well
+                                  not supported with --interactive or other interactive options
+         --prompt-cache-ro        if specified, uses the prompt cache but does not update it
+  -r,    --reverse-prompt PROMPT  halt generation at PROMPT, return control in interactive mode
+                                  can be specified more than once for multiple prompts
+  -sp,   --special                special tokens output enabled (default: false)
+  -cnv,  --conversation           run in conversation mode, does not print special tokens and suffix/prefix
+                                  if suffix/prefix are not specified, default chat template will be used
+                                  (default: false)
+  -i,    --interactive            run in interactive mode (default: false)
+  -if,   --interactive-first      run in interactive mode and wait for input right away (default: false)
+  -mli,  --multiline-input        allows you to write or paste multiple lines without ending each in '\'
+         --in-prefix-bos          prefix BOS to user inputs, preceding the `--in-prefix` string
+         --in-prefix STRING       string to prefix user inputs with (default: empty)
+         --in-suffix STRING       string to suffix after user inputs with (default: empty)
+         --spm-infill             use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)

-**Sampling params**
+sampling:

-| Argument | Explanation |
-| -------- | ----------- |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
-| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
-| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
-| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
-| `--penalize-nl` | penalize newline tokens (default: false) |
-| `--temp N` | temperature (default: 0.8) |
-| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
-| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
-| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
-| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) |
-| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
-| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
-| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
-| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
-| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
-| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
-| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
-| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
-| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
-| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
-| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
-| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
-| `--grammar-file FNAME` | file to read grammar from |
-| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
+         --samplers SAMPLERS      samplers that will be used for generation in the order, separated by ';'
+                                  (default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
+         --sampling-seq SEQUENCE  simplified sequence for samplers that will be used (default: kfypmt)
+         --ignore-eos             ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
+         --penalize-nl            penalize newline tokens (default: false)
+         --temp N                 temperature (default: 0.8)
+         --top-k N                top-k sampling (default: 40, 0 = disabled)
+         --top-p N                top-p sampling (default: 0.9, 1.0 = disabled)
+         --min-p N                min-p sampling (default: 0.1, 0.0 = disabled)
+         --tfs N                  tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
+         --typical N              locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
+         --repeat-last-n N        last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
+         --repeat-penalty N       penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
+         --presence-penalty N     repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
+         --frequency-penalty N    repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
+         --dynatemp-range N       dynamic temperature range (default: 0.0, 0.0 = disabled)
+         --dynatemp-exp N         dynamic temperature exponent (default: 1.0)
+         --mirostat N             use Mirostat sampling.
+                                  Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
+                                  (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
+         --mirostat-lr N          Mirostat learning rate, parameter eta (default: 0.1)
+         --mirostat-ent N         Mirostat target entropy, parameter tau (default: 5.0)
+         -l TOKEN_ID(+/-)BIAS     modifies the likelihood of token appearing in the completion,
+                                  i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
+                                  or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
+         --cfg-negative-prompt PROMPT
+                                  negative prompt to use for guidance (default: '')
+         --cfg-negative-prompt-file FNAME
+                                  negative prompt file to use for guidance
+         --cfg-scale N            strength of guidance (default: 1.0, 1.0 = disable)
+         --chat-template JINJA_TEMPLATE
+                                  set custom jinja chat template (default: template taken from model's metadata)
+                                  if suffix/prefix are specified, template will be disabled
+                                  only commonly used templates are accepted:
+                                  https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template

+grammar:

-**Example-specific params**
+         --grammar GRAMMAR        BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
+         --grammar-file FNAME     file to read grammar from
+  -j,    --json-schema SCHEMA     JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
+                                  For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead

-| Argument | Explanation |
-| -------- | ----------- |
-| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
-| `-sp, --special` | special tokens output enabled (default: false) |
-| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
-| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
-| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
-| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
-| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
-| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
-| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
-| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
-| `--api-key-file FNAME` | path to file containing API keys (default: none) |
-| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
-| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
-| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
-| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
-| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
-| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
-| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
-| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
-| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
+embedding:

+         --pooling {none,mean,cls,last}
+                                  pooling type for embeddings, use model default if unspecified
+         --attention {causal,non-causal}
+                                  attention type for embeddings, use model default if unspecified

-Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
+context hacking:
+
+         --rope-scaling {none,linear,yarn}
+                                  RoPE frequency scaling method, defaults to linear unless specified by the model
+         --rope-scale N           RoPE context scaling factor, expands context by a factor of N
+         --rope-freq-base N       RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
+         --rope-freq-scale N      RoPE frequency scaling factor, expands context by a factor of 1/N
+         --yarn-orig-ctx N        YaRN: original context size of model (default: 0 = model training context size)
+         --yarn-ext-factor N      YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
+         --yarn-attn-factor N     YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
+         --yarn-beta-slow N       YaRN: high correction dim or alpha (default: 1.0)
+         --yarn-beta-fast N       YaRN: low correction dim or beta (default: 32.0)
+  -gan,  --grp-attn-n N           group-attention factor (default: 1)
+  -gaw,  --grp-attn-w N           group-attention width (default: 512.0)
+  -dkvc, --dump-kv-cache          verbose print of the KV cache
+  -nkvo, --no-kv-offload          disable KV offload
+  -ctk,  --cache-type-k TYPE      KV cache data type for K (default: f16)
+  -ctv,  --cache-type-v TYPE      KV cache data type for V (default: f16)
+
+perplexity:
+
+         --all-logits             return logits for all tokens in the batch (default: false)
+         --hellaswag              compute HellaSwag score over random tasks from datafile supplied with -f
+         --hellaswag-tasks N      number of tasks to use when computing the HellaSwag score (default: 400)
+         --winogrande             compute Winogrande score over random tasks from datafile supplied with -f
+         --winogrande-tasks N     number of tasks to use when computing the Winogrande score (default: 0)
+         --multiple-choice        compute multiple choice score over random tasks from datafile supplied with -f
+         --multiple-choice-tasks N
+                                  number of tasks to use when computing the multiple choice score (default: 0)
+         --kl-divergence          computes KL-divergence to logits provided via --kl-divergence-base
+         --ppl-stride N           stride for perplexity calculation (default: 0)
+         --ppl-output-type {0,1}  output type for perplexity calculation (default: 0)
+
+parallel:
+
+  -dt,   --defrag-thold N         KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
+  -np,   --parallel N             number of parallel sequences to decode (default: 1)
+  -ns,   --sequences N            number of sequences to decode (default: 1)
+  -cb,   --cont-batching          enable continuous batching (a.k.a dynamic batching) (default: enabled)
+
+multi-modality:
+
+         --mmproj FILE            path to a multimodal projector file for LLaVA. see examples/llava/README.md
+         --image FILE             path to an image file. use with multimodal models. Specify multiple times for batching
+
+backend:
+
+         --rpc SERVERS            comma separated list of RPC servers
+         --mlock                  force system to keep model in RAM rather than swapping or compressing
+         --no-mmap                do not memory-map model (slower load but may reduce pageouts if not using mlock)
+         --numa TYPE              attempt optimizations that help on some NUMA systems
+                                    - distribute: spread execution evenly over all nodes
+                                    - isolate: only spawn threads on CPUs on the node that execution started on
+                                    - numactl: use the CPU map provided by numactl
+                                  if run without this previously, it is recommended to drop the system page cache before using this
+                                  see https://github.com/ggerganov/llama.cpp/issues/1437
+
+model:
+
+         --check-tensors          check model tensor data for invalid values (default: false)
+         --override-kv KEY=TYPE:VALUE
+                                  advanced option to override model metadata by key. may be specified multiple times.
+                                  types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
+         --lora FNAME             apply LoRA adapter (implies --no-mmap)
+         --lora-scaled FNAME S    apply LoRA adapter with user defined scaling S (implies --no-mmap)
+         --lora-base FNAME        optional model to use as a base for the layers modified by the LoRA adapter
+         --control-vector FNAME   add a control vector
+                                  note: this argument can be repeated to add multiple control vectors
+         --control-vector-scaled FNAME SCALE
+                                  add a control vector with user defined scaling SCALE
+                                  note: this argument can be repeated to add multiple scaled control vectors
+         --control-vector-layer-range START END
+                                  layer range to apply the control vector(s) to, start and end inclusive
+  -m,    --model FNAME            model path (default: models/$filename with filename from --hf-file
+                                  or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
+  -md,   --model-draft FNAME      draft model for speculative decoding (default: unused)
+  -mu,   --model-url MODEL_URL    model download url (default: unused)
+  -hfr,  --hf-repo REPO           Hugging Face model repository (default: unused)
+  -hff,  --hf-file FILE           Hugging Face model file (default: unused)
+  -hft,  --hf-token TOKEN         Hugging Face access token (default: value from HF_TOKEN environment variable)
+
+server:
+
+         --host HOST              ip address to listen (default: 127.0.0.1)
+         --port PORT              port to listen (default: 8080)
+         --path PATH              path to serve static files from (default: )
+         --embedding(s)           restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
+         --api-key KEY            API key to use for authentication (default: none)
+         --api-key-file FNAME     path to file containing API keys (default: none)
+         --ssl-key-file FNAME     path to file a PEM-encoded SSL private key
+         --ssl-cert-file FNAME    path to file a PEM-encoded SSL certificate
+         --timeout N              server read/write timeout in seconds (default: 600)
+         --threads-http N         number of threads used to process HTTP requests (default: -1)
+         --system-prompt-file FNAME
+                                  set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
+         --log-format {text,json}
+                                  log output format: json or text (default: json)
+         --metrics                enable prometheus compatible metrics endpoint (default: disabled)
+         --no-slots               disables slots monitoring endpoint (default: enabled)
+         --slot-save-path PATH    path to save slot kv cache (default: disabled)
+         --chat-template JINJA_TEMPLATE
+                                  set custom jinja chat template (default: template taken from model's metadata)
+                                  only commonly used templates are accepted:
+                                  https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+  -sps,  --slot-prompt-similarity SIMILARITY
+                                  how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
+         --lora-init-without-apply
+                                  load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
+
+logging:
+
+         --simple-io              use basic IO for better compatibility in subprocesses and limited consoles
+  -ld,   --logdir LOGDIR          path under which to save YAML logs (no logging if unset)
+         --log-test               Run simple logging test
+         --log-disable            Disable trace logs
+         --log-enable             Enable trace logs
+         --log-file FNAME         Specify a log filename (without extension)
+         --log-new                Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
+         --log-append             Don't truncate the old log file.
+```
+
+Available environment variables (if specified, these variables will override parameters specified in arguments):
+
+- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
+- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
+- `LLAMA_ARG_MODEL`: equivalent to `-m`
+- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
+- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
+- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
+- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
+- `LLAMA_ARG_THREADS`: equivalent to `-t`
+- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
+- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
+- `LLAMA_ARG_BATCH`: equivalent to `-b`
+- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
+- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
+- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
+- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
+- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
+- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
+- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
+- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
+- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
+- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
+- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
+- `LLAMA_ARG_HOST`: equivalent to `--host`
+- `LLAMA_ARG_PORT`: equivalent to `--port`

 Example usage of docker compose with environment variables:

@@ -173,7 +289,7 @@ services:
      LLAMA_ARG_MODEL: /models/my_model.gguf
      LLAMA_ARG_CTX_SIZE: 4096
      LLAMA_ARG_N_PARALLEL: 2
-      LLAMA_ARG_ENDPOINT_METRICS: 1
+      LLAMA_ARG_ENDPOINT_METRICS: 1  # to disable, either remove or set to 0
      LLAMA_ARG_PORT: 8080
 ```

@@ -427,44 +543,9 @@ Notice that each `probs` is an array of length `n_probs`.

    *Options:*

-    `content`: (Required) The text to tokenize.
+    `content`: Set the text to tokenize.

-    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
-
-    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
-
-**Response:**
-
-Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
-
-
-If `with_pieces` is `false`:
-```json
-{
-  "tokens": [123, 456, 789]
-}
-```
-
-If `with_pieces` is `true`:
-```json
-{
-  "tokens": [
-    {"id": 123, "piece": "Hello"},
-    {"id": 456, "piece": " world"},
-    {"id": 789, "piece": "!"}
-  ]
-}
-```
-
-With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
-```json
-{
-  "tokens": [
-    {"id": 198, "piece": [195]}, // hex C3
-    {"id": 164, "piece": [161]} // hex A1
-  ]
-}
-```
+    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`

 ### POST `/detokenize`: Convert tokens to text

@@ -482,39 +563,6 @@ The same as [the embedding example](../embedding) does.

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-### POST `/reranking`: Rerank documents according to a given query
-
-Similar to https://jina.ai/reranker/ but might change in the future.
-Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
-
-    *Options:*
-
-    `query`: The query against which the documents will be ranked.
-
-    `documents`: An array strings representing the documents to be ranked.
-
-    *Aliases:*
-      - `/rerank`
-      - `/v1/rerank`
-      - `/v1/reranking`
-
-    *Examples:*
-
-    ```shell
-    curl http://127.0.0.1:8012/v1/rerank \
-        -H "Content-Type: application/json" \
-        -d '{
-            "model": "some-model",
-                "query": "What is panda?",
-                "top_n": 3,
-                "documents": [
-                    "hi",
-                "it is a bear",
-                "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
-                ]
-        }' | jq
-    ```
-
 ### POST `/infill`: For code infilling.

 Takes a prefix and a suffix and returns the predicted completion as stream.
@@ -555,7 +603,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte

    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

-    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
+    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.

    *Examples:*

--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -40,6 +40,7 @@ server --host localhost --port 8080 \
  --parallel 8 \
  --batch-size 512 \
  --ctx-size 4096 \
+  --log-format text \
  -ngl 33
 ```

--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -272,6 +272,7 @@ def start_server_background(args):
    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.append('--flash-attn')
+    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
    pkwargs = {
--- a/examples/server/public/loading.html
+++ b/examples/server/public/loading.html
@@ -1,12 +0,0 @@
-<!DOCTYPE html>
-<html>
-    <head>
-        <meta http-equiv="refresh" content="5">
-    </head>
-    <body>
-        <div id="loading">
-            The model is loading. Please wait.<br/>
-            The user interface will appear soon.
-        </div>
-    </body>
-</html>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
--- a/examples/server/tests/.gitignore
+++ b/examples/server/tests/.gitignore
@@ -1 +0,0 @@
-.venv
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -40,6 +40,7 @@ It's possible to override some scenario steps values with environment variables:
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
 | `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                         |
 | `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                       |
+| `SERVER_LOG_FORMAT_JSON` | if set, switches server logs to JSON format                                                    |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                |

 ### Run @bug, @wip or @wrong_usage annotated scenario
--- a/examples/server/tests/features/ctx_shift.feature
+++ b/examples/server/tests/features/ctx_shift.feature
@@ -1,62 +0,0 @@
-@llama.cpp
-@ctx_shift
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a model file test-model.gguf
-    And   a model alias tinyllama-2
-    And   BOS token is 1
-    And   42 as server seed
-    And   256 KV cache size
-    And   32 as batch size
-    And   2 slots
-
-  Scenario: Inference with context shift
-    And   64 server max tokens to predict
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
-    And   the completion is  truncated
-    And   109 prompt tokens are processed
-
-  Scenario Outline: Inference without context shift
-    And   <n_predict> server max tokens to predict
-    And   disable context shifting
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Hi how are you
-    """
-    And   a completion request with no api error
-    Then  <n_token_output> tokens are predicted matching twind|Anna
-    And   the completion is <truncated> truncated
-    And   8 prompt tokens are processed
-    Examples:
-      | n_predict | n_token_output | truncated |
-      | 64        | 64             | not       |
-      | -1        | 120            |           |
-
-  Scenario: Inference without context shift (expected error: prompt too long)
-    And   disable context shifting
-    Then  the server is starting
-    Then  the server is healthy
-    Given a prompt:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And   a completion request with 400 api error
-
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -9,13 +9,10 @@ Feature: llama.cpp server
    And   a model alias bert-bge-small
    And   42 as server seed
    And   2 slots
-    # the bert-bge-small model has context size of 512
-    # since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
-    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
-    And   128 as batch size
-    And   128 as ubatch size
-    And   512 KV cache size
-    And   enable embeddings endpoint
+    And   1024 as batch size
+    And   1024 as ubatch size
+    And   2048 KV cache size
+    And   embeddings extraction
    Then  the server is starting
    Then  the server is healthy

@@ -26,20 +23,6 @@ Feature: llama.cpp server
    """
    Then embeddings are generated

-  Scenario: Embedding (error: prompt too long)
-    When embeddings are computed for:
-    """
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
-    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
-    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
-    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
-    """
-    And  embeddings request with 500 api error
-
  Scenario: OAI Embeddings compatibility
    Given a model bert-bge-small
    When an OAI compatible embeddings computation request for:
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -77,35 +77,6 @@ Feature: Parallel
      | disabled  | 128       |
      | enabled   | 64        |

-  Scenario Outline: Multi users with number of prompts exceeding number of slots
-    Given a system prompt You are a writer.
-    And   a model tinyllama-2
-    Given a prompt:
-      """
-      Write a very long book.
-      """
-    And a prompt:
-      """
-      Write another a poem.
-      """
-    And a prompt:
-      """
-      What is LLM?
-      """
-    And a prompt:
-      """
-      The sky is blue and I love it.
-      """
-    And <n_predict> max tokens to predict
-    And streaming is <streaming>
-    Given concurrent OAI completions requests
-    Then the server is busy
-    Then the server is idle
-    Then all prompts are predicted with <n_predict> tokens
-    Examples:
-      | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |

  Scenario:  Multi users with total number of tokens to predict exceeds the KV Cache size #3969
    Given a prompt:
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -15,7 +15,6 @@ Feature: Passkey / Self-extend with context shift
    And   <n_junk> as number of junk
    And   <n_predicted> server max tokens to predict
    And   42 as seed
-    And   0.0 temperature
    And   <n_ctx> KV cache size
    And   1 slots
    And   <n_ga> group attention factor to extend context size through self-extend
@@ -23,8 +22,7 @@ Feature: Passkey / Self-extend with context shift
    # Can be override with N_GPU_LAYERS
    And   <ngl> GPU offloaded layers
    Then  the server is starting
-    # Higher timeout because the model may need to be downloaded from the internet
-    Then  the server is healthy with timeout 120 seconds
+    Then  the server is healthy
    Given available models
    Then  model 0 is trained on <n_ctx_train> tokens context
    Given a prefix prompt:
--- a/examples/server/tests/features/rerank.feature
+++ b/examples/server/tests/features/rerank.feature
@@ -1,42 +0,0 @@
-@llama.cpp
-@rerank
-Feature: llama.cpp server
-
-  Background: Server startup
-    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/jina-reranker-v1-tiny-en/ggml-model-f16.gguf
-    And   a model file jina-reranker-v1-tiny-en.gguf
-    And   a model alias jina-reranker-v1-tiny-en
-    And   42 as server seed
-    And   2 slots
-    And   512 as batch size
-    And   512 as ubatch size
-    And   512 KV cache size
-    And   enable reranking endpoint
-    Then  the server is starting
-    Then  the server is healthy
-
-  Scenario: Rerank
-    Given a rerank query:
-      """
-      Machine learning is
-      """
-    And   a rerank document:
-      """
-      A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.
-      """
-    And   a rerank document:
-      """
-      Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.
-      """
-    And   a rerank document:
-      """
-      Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.
-      """
-    And   a rerank document:
-      """
-      Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine.
-      """
-    When  reranking request
-    Then  reranking results are returned
-    Then  reranking highest score is index 2 and lowest score is index 3
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -105,14 +105,6 @@ Feature: llama.cpp server
    Given first token is removed
    Then  tokens can be detokenized

-  Scenario: Tokenize with pieces
-    When  tokenizing with pieces:
-    """
-    What is the capital of Germany?
-    媽
-    """
-    Then  tokens are given with pieces
-
  Scenario: Models available
    Given available models
    Then  1 models are supported
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,6 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
 import asyncio
 import json
 import os
@@ -68,7 +65,6 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.server_api_key = None
    context.server_continuous_batching = False
    context.server_embeddings = False
-    context.server_reranking = False
    context.server_metrics = False
    context.server_process = None
    context.seed = None
@@ -78,16 +74,11 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.response_format = None
    context.temperature = None
    context.lora_file = None
-    context.disable_ctx_shift = False

    context.tasks_result = []
    context.concurrent_tasks = []
    context.prompts = []

-    context.reranking_query = None
-    context.reranking_documents = []
-    context.reranking_results = None
-

@step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file: str, hf_repo: str):
@@ -154,7 +145,7 @@ def step_n_slots(context, n_slots: int):

@step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict if n_predict > 0 else None
+    context.n_server_predict = n_predict


@step('{slot_save_path} as slot save path')
@@ -177,21 +168,15 @@ def step_server_continuous_batching(context):
    context.server_continuous_batching = True


-@step('enable embeddings endpoint')
+@step('embeddings extraction')
 def step_server_embeddings(context):
    context.server_embeddings = True

-@step('enable reranking endpoint')
-def step_server_reranking(context):
-    context.server_reranking = True

@step('prometheus compatible metrics exposed')
 def step_server_metrics(context):
    context.server_metrics = True

-@step('disable context shifting')
-def step_server_disable_ctx_shift(context):
-    context.disable_ctx_shift = True

@step("the server is starting")
 def step_start_server(context):
@@ -217,15 +202,17 @@ def step_start_server(context):
            time.sleep(0.1)


-async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
+@step("the server is {expecting_status}")
+@async_run_until_complete
+async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
    match expecting_status:
        case 'healthy':
            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout)
+                                        timeout=30)

        case 'ready' | 'idle':
            await wait_for_slots_status(context, context.base_url, 200,
-                                        timeout=timeout,
+                                        timeout=30,
                                        params={'fail_on_no_slot': 1},
                                        slots_idle=context.n_slots,
                                        slots_processing=0)
@@ -238,18 +225,6 @@ async def wait_for_server_status_with_timeout(context, expecting_status: Literal
            assert False, "unknown status"


-@step("the server is {expecting_status} with timeout {timeout:d} seconds")
-@async_run_until_complete
-async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
-    await wait_for_server_status_with_timeout(context, expecting_status, timeout)
-
-
-@step("the server is {expecting_status}")
-@async_run_until_complete
-async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
-    await wait_for_server_status_with_timeout(context, expecting_status, 30)
-
-
@step('all slots are {expected_slot_status_string}')
@async_run_until_complete
 async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
@@ -269,7 +244,7 @@ async def step_all_slots_status(context, expected_slot_status_string: Literal['i
@step('a completion request with {api_error} api error')
@async_run_until_complete
 async def step_request_completion(context, api_error: Literal['raised'] | str):
-    expect_api_error = api_error == 'raised' or api_error != 'no'
+    expect_api_error = api_error == 'raised'
    seeds = await completions_seed(context, num_seeds=1)
    completion = await request_completion(context.prompts.pop(),
                                          seeds[0] if seeds is not None else seeds,
@@ -284,11 +259,8 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
    context.tasks_result.append(completion)
    if context.debug:
        print(f"Completion response: {completion}")
-    if api_error == 'raised':
+    if expect_api_error:
        assert completion == 401, f"completion must be an 401 status code: {completion}"
-    elif api_error.isdigit():
-        api_error_code = int(api_error)
-        assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"


@step('{predicted_n:d} tokens are predicted matching {re_content}')
@@ -460,14 +432,6 @@ def step_impl(context, n_ga_w):
 def step_prompt_passkey(context):
    context.prompt_passkey = context_text(context)

-@step('a rerank query')
-def step_set_rerank_query(context):
-    context.reranking_query = context_text(context)
-    context.reranking_documents = []
-
-@step('a rerank document')
-def step_set_rerank_document(context):
-    context.reranking_documents.append(context_text(context))

@step('{n_prompts:d} fixed prompts')
 def step_fixed_prompts(context, n_prompts):
@@ -635,22 +599,6 @@ async def step_compute_embedding(context):
    context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)


-@step('reranking request')
-@async_run_until_complete
-async def step_compute_reranking(context):
-    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
-        async with session.post(f'{context.base_url}/reranking',
-                                json={
-                                    "query": context.reranking_query,
-                                    "documents": context.reranking_documents,
-                                }) as response:
-            if response.status == 200:
-                response_json = await response.json()
-                context.reranking_results = response_json['results']
-            else:
-                context.reranking_results = response.status
-
-
@step('all embeddings are the same')
@async_run_until_complete
 async def step_all_embeddings_are_the_same(context):
@@ -684,9 +632,6 @@ def step_assert_embeddings(context):
    for embedding in context.embeddings:
        assert_embeddings(embedding)

-@step('embeddings request with {api_error_code:d} api error')
-def step_assert_embeddings(context, api_error_code: int):
-    assert context.embeddings == api_error_code, f"embeddings request must return code {api_error_code}, but got {context.embeddings}"

@step('an OAI compatible embeddings computation request for')
@async_run_until_complete
@@ -736,56 +681,12 @@ async def all_embeddings_are_generated(context):
    for i in range(n_embedding_requests):
        assert_embeddings(context.tasks_result.pop().pop())

-@step('reranking results are returned')
-def reranking_results_are_returned(context):
-    assert len(context.reranking_results) == len(context.reranking_documents)
-
-@step('reranking highest score is index {idx_high:d} and lowest score is index {idx_low:d}')
-def reranking_results_are_returned(context, idx_high: int, idx_low: int):
-    max_score, max_idx = 0, 0
-    min_score, min_idx = 0, 0
-    for res in context.reranking_results:
-        if max_score < res['relevance_score']:
-            max_score = res['relevance_score']
-            max_idx   = res['index']
-        if min_score > res['relevance_score']:
-            min_score = res['relevance_score']
-            min_idx   = res['index']
-    print(context.reranking_results)
-    assert max_idx == idx_high
-    assert min_idx == idx_low

@step('adding special tokens')
 def step_tokenize_set_add_special(context):
    context.tokenize_add_special = True


-@step("tokenizing with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    context.tokenized_text = context_text(context)
-    async with aiohttp.ClientSession() as session:
-        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
-        if getattr(context, "tokenize_add_special", None) is not None:
-            tokenize_args["add_special"] = context.tokenize_add_special
-
-        async with session.post(
-            f"{context.base_url}/tokenize", json=tokenize_args
-        ) as response:
-            assert response.status == 200
-            tokenize_json = await response.json()
-            context.tokens_with_pieces = tokenize_json["tokens"]
-
-
-@step("tokens are given with pieces")
-@async_run_until_complete
-async def step_tokenize_with_pieces(context):
-    # Verify that the response contains both token IDs and pieces
-    assert all(
-        "id" in token and "piece" in token for token in context.tokens_with_pieces
-    )
-
-
@step('tokenizing')
@async_run_until_complete
 async def step_tokenize(context):
@@ -1080,8 +981,6 @@ async def oai_chat_completions(user_prompt,
                            event_data = line.split(': ', 1)
                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
                            chunk_raw = event_data[1]
-                            if chunk_raw == '[DONE]':
-                                break

                            chunk = json.loads(chunk_raw)
                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
@@ -1149,17 +1048,15 @@ async def oai_chat_completions(user_prompt,
    return completion_response


-async def request_embedding(content, seed, base_url=None) -> list[list[float]] | int:
+async def request_embedding(content, seed, base_url=None) -> list[list[float]]:  # NOTE: seed is accepted but not sent
    async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
        async with session.post(f'{base_url}/embedding',
                                json={
                                    "content": content,
                                }) as response:
-            if response.status == 200:
-                response_json = await response.json()
-                return [response_json['embedding']]
-            else:
-                return response.status
+            assert response.status == 200, f"embedding request must return 200, but got {response.status}"
+            response_json = await response.json()
+            return [response_json['embedding']]  # wrapped in a list so callers always receive a list of embeddings


 async def request_oai_embeddings(input, seed,
@@ -1412,8 +1309,6 @@ def start_server_background(context):
        server_args.append('--cont-batching')
    if context.server_embeddings:
        server_args.append('--embedding')
-    if context.server_reranking:
-        server_args.append('--reranking')
    if context.server_metrics:
        server_args.append('--metrics')
    if context.model_alias:
@@ -1436,8 +1331,8 @@ def start_server_background(context):
        server_args.append('--verbose')
    if context.lora_file:
        server_args.extend(['--lora', context.lora_file])
-    if context.disable_ctx_shift:
-        server_args.extend(['--no-context-shift'])
+    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
+        server_args.extend(['--log-format', "text"])

    args = [str(arg) for arg in [context.server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,6 +1,6 @@
 aiohttp~=3.9.3
 behave~=1.2.6
-huggingface_hub~=0.23.2
+huggingface_hub~=0.20.3
 numpy~=1.26.4
 openai~=1.30.3
 prometheus-client~=0.20.0
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1,8 +1,7 @@
 #pragma once

-#include "common.h"
-#include "log.h"
 #include "llama.h"
+#include "common.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -16,10 +15,10 @@
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

-#include <random>
-#include <sstream>
 #include <string>
 #include <vector>
+#include <sstream>
+#include <random>

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

@@ -36,6 +35,32 @@ enum error_type {
    ERROR_TYPE_NOT_SUPPORTED, // custom error
 };

+extern bool server_verbose;  // when true, LOG_VERBOSE records are emitted (defined elsewhere)
+extern bool server_log_json; // when true, server_log prints JSON objects; otherwise plain-text lines
+
+#ifndef SERVER_VERBOSE // default below applies only when the macro was not already defined
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...) // verbose logging compiled out: expands to nothing
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERR",  __func__, __LINE__, MSG, __VA_ARGS__) // always emitted
+#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) // always emitted
+#define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) // always emitted
+
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); // forward declaration: the LOG_* macros are used before the definition
+
 template <typename T>
 static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
@@ -43,7 +68,9 @@ static T json_value(const json & body, const std::string & key, const T & defaul
        try {
            return body.at(key);
        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+            std::stringstream ss;
+            ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
+            LOG_WARNING(ss.str().c_str(), body);
            return default_value;
        }
    } else {
@@ -51,6 +78,48 @@ static T json_value(const json & body, const std::string & key, const T & defaul
    }
 }

+// Write one log record to stdout: a JSON object when server_log_json is set, otherwise a text line
+// "LEVEL [function] message | key=value ...". Members of `extra` are merged into the record.
+static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
+    std::stringstream ss_tid;
+    ss_tid << std::this_thread::get_id();
+    json log = json{
+        {"tid",       ss_tid.str()},
+        {"timestamp", time(nullptr)},
+    };
+
+    if (server_log_json) {
+        log.merge_patch({
+            {"level",    level},
+            {"function", function},
+            {"line",     line},
+            {"msg",      message},
+        });
+
+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }
+
+        printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
+    } else {
+        if (!extra.empty()) {
+            log.merge_patch(extra);
+        }
+        // format the prefix with stream field widths (same layout as "%4s [%24s] %s") rather than a
+        // fixed 1024-byte snprintf buffer, so that long messages are no longer silently truncated
+        std::stringstream ss;
+        ss.width(4);
+        ss << level << " [";
+        ss.width(24);
+        ss << function << "] " << message << " |";
+        for (const auto & el : log.items()) {
+            ss << " " << el.key() << "=" << el.value().dump(-1, ' ', false, json::error_handler_t::replace);
+        }
+        printf("%s\n", ss.str().c_str());
+    }
+    fflush(stdout);
+}
+
 //
 // chat template utils
 //
@@ -84,9 +153,8 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        chat.push_back({role, content});
    }

-    const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
-    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
-
+    auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+    LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
    return formatted_chat;
 }

@@ -175,7 +243,10 @@ static std::string random_string() {
 }

 static std::string gen_chatcmplid() {
-    return "chatcmpl-" + random_string();
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+
+    return chatcmplid.str();
 }

 //
@@ -216,7 +287,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
    return std::string::npos;
 }

-static bool json_is_array_of_numbers(const json & data) {
+static bool json_is_array_of_numbers(json data) {
    if (data.is_array()) {
        for (const auto & e : data) {
            if (!e.is_number()) {
@@ -292,13 +363,15 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
    return out;
 }

-static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
+static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
    const std::string str =
        std::string(event) + ": " +
        data.dump(-1, ' ', false, json::error_handler_t::replace) +
-        "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
+        "\n\n";

-    LOG_DBG("data stream, to_send: %s", str.c_str());
+    LOG_VERBOSE("data stream", {
+        { "to_send", str }
+    });

    return sink.write(str.c_str(), str.size());
 }
@@ -331,9 +404,6 @@ static json oaicompat_completion_params_parse(
        std::string response_type = json_value(response_format, "type", std::string());
        if (response_type == "json_object") {
            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
-        } else if (response_type == "json_schema") {
-            json json_schema = json_value(response_format, "json_schema", json::object());
-            llama_params["json_schema"] = json_value(json_schema, "schema", json::object());
        } else if (!response_type.empty() && response_type != "text") {
            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
        }
@@ -355,7 +425,7 @@ static json oaicompat_completion_params_parse(

    // Params supported by OAI but unsupported by llama.cpp
    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
-    for (const auto & param : unsupported_params) {
+    for (auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
        }
@@ -374,7 +444,7 @@ static json oaicompat_completion_params_parse(
    return llama_params;
 }

-static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
+static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
    bool stopped_word        = result.count("stopped_word") != 0;
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@@ -411,8 +481,7 @@ static json format_final_response_oaicompat(const json & request, const json & r
        {"id", completion_id}
    };

-    // extra fields for debugging purposes
-    if (verbose) {
+    if (server_verbose) {
        res["__verbose"] = result;
    }

@@ -424,7 +493,7 @@ static json format_final_response_oaicompat(const json & request, const json & r
 }

 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
        return std::vector<json>({result});
    }
@@ -526,7 +595,7 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
 static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
    json data = json::array();
    int i = 0;
-    for (const auto & elem : embeddings) {
+    for (auto & elem : embeddings) {
        data.push_back(json{
            {"embedding", json_value(elem, "embedding", json::array())},
            {"index",     i++},
@@ -537,7 +606,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
    json res = json {
        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", "list"},
-        {"usage", json { // TODO: fill
+        {"usage", json {
            {"prompt_tokens", 0},
            {"total_tokens", 0}
        }},
@@ -547,63 +616,7 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
    return res;
 }

-static json format_response_rerank(const json & request, const json & ranks) {
-    json data = json::array();
-    int i = 0;
-    for (const auto & rank : ranks) {
-        data.push_back(json{
-            {"index",    i++},
-            {"relevance_score", json_value(rank, "score", 0.0)},
-        });
-    }
-
-    json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", "list"},
-        {"usage", json { // TODO: fill
-            {"prompt_tokens", 0},
-            {"total_tokens", 0}
-        }},
-        {"results", data}
-    };
-
-    return res;
-}
-
-static bool is_valid_utf8(const std::string & str) {
-    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
-    const unsigned char* end = bytes + str.length();
-
-    while (bytes < end) {
-        if (*bytes <= 0x7F) {
-            // 1-byte sequence (0xxxxxxx)
-            bytes++;
-        } else if ((*bytes & 0xE0) == 0xC0) {
-            // 2-byte sequence (110xxxxx 10xxxxxx)
-            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
-                return false;
-            bytes += 2;
-        } else if ((*bytes & 0xF0) == 0xE0) {
-            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
-                return false;
-            bytes += 3;
-        } else if ((*bytes & 0xF8) == 0xF0) {
-            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
-                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
-                return false;
-            bytes += 4;
-        } else {
-            // Invalid UTF-8 lead byte
-            return false;
-        }
-    }
-
-    return true;
-}
-
-static json format_tokenizer_response(const json & tokens) {
+static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
    return json {
        {"tokens", tokens}
    };
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,14 +1,17 @@
-#include "arg.h"
 #include "common.h"
-#include "log.h"
 #include "llama.h"

+#include <cmath>
+#include <cstdio>
+#include <string>
 #include <vector>

-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
-    LOG("\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+    LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -17,12 +20,11 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    // total length of the sequence including the prompt
    const int n_predict = params.n_predict;

@@ -53,13 +55,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto sparams = llama_sampler_chain_default_params();
-
-    sparams.no_perf = false;
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+    llama_sampling * smpl = llama_sampling_init(model, llama_sampling_default_params());

    // tokenize the prompt

@@ -69,24 +65,25 @@ int main(int argc, char ** argv) {
    const int n_ctx    = llama_n_ctx(ctx);
    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s:        either reduce n_predict or increase n_ctx\n", __func__);
+        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+        LOG_TEE("%s:        either reduce n_predict or increase n_ctx\n", __func__);
        return 1;
    }

    // print the prompt token-by-token

-    LOG("\n");
+    fprintf(stderr, "\n");

    for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

+    fflush(stderr);
+
    // create a llama_batch with size 512
    // we use this object to submit token data for decoding

@@ -101,7 +98,7 @@ int main(int argc, char ** argv) {
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0) {
-        LOG("%s: llama_decode() failed\n", __func__);
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
        return 1;
    }

@@ -115,16 +112,21 @@ int main(int argc, char ** argv) {
    while (n_cur <= n_predict) {
        // sample the next token
        {
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+            const auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+
+            llama_sampling_set_logits(smpl, logits);
+
+            // sample the most likely token
+            const llama_token new_token_id = llama_sampling_sample_greedy(smpl, nullptr);

            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
-                LOG("\n");
+                LOG_TEE("\n");

                break;
            }

-            LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
            fflush(stdout);

            // prepare the next batch
@@ -140,26 +142,25 @@ int main(int argc, char ** argv) {

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

-    LOG("\n");
+    LOG_TEE("\n");

    const auto t_main_end = ggml_time_us();

-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    LOG("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
+    llama_print_timings(ctx, nullptr);

-    LOG("\n");
+    fprintf(stderr, "\n");

    llama_batch_free(batch);
-    llama_sampler_free(smpl);
+
+    llama_sampling_free(smpl);
    llama_free(ctx);
    llama_free_model(model);

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,16 +1,11 @@
-#include "arg.h"
 #include "common.h"
-#include "sampling.h"
-#include "log.h"
 #include "llama.h"

-#include <algorithm>
+#include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <random>
-#include <set>
 #include <string>
 #include <vector>
+#include <set>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -26,35 +21,40 @@ struct seq_draft {
    std::vector<llama_token> tokens;
    std::vector<std::vector<llama_token_data>> dists;

-    struct gpt_sampler * smpl = nullptr;
+    struct llama_sampling * smpl;
 };

 int main(int argc, char ** argv) {
    gpt_params params;

-    // needed to get candidate probs even for temp <= 0.0
-    params.sparams.n_probs = 128;
-
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+    if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

-    gpt_init();
-
    if (params.model_draft.empty()) {
-        LOG_ERR("%s: --model-draft is required\n", __func__);
+        fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
        return 1;
    }

+    // for probabilities to be computed even with temp = 0
+    params.sparams.n_probs = 16;
+
    // max number of parallel drafting sequences (i.e. tree branches)
    const int n_seq_dft = params.n_parallel;

    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
    const float p_split  = params.p_split;

-    std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
+    std::default_random_engine rng(params.sparams.seed);
    std::uniform_real_distribution<> u_dist;

+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("speculative", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+#endif // LOG_DISABLE_LOGS
+
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -83,14 +83,14 @@ int main(int argc, char ** argv) {
    ctx_dft = llama_init_dft.context;

    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-    LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
+    LOG("vocab_type tgt: %d\n", vocab_type_tgt);

    const bool vocab_type_dft = llama_vocab_type(model_dft);
-    LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
+    LOG("vocab_type dft: %d\n", vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
-        LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
-        LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+        fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
+        fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        return 1;
    }

@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
    ) {
-        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
        return 1;
    }

@@ -112,8 +112,8 @@ int main(int argc, char ** argv) {
            : n_vocab_dft - n_vocab_tgt;

        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
-            LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+            fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
+            fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
                    n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
            return 1;
        }
@@ -122,8 +122,8 @@ int main(int argc, char ** argv) {
            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
            const char * token_text_dft = llama_token_get_text(model_dft, i);
            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-                LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
-                LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
+                fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
+                fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
                        llama_token_to_piece(ctx_tgt, i).c_str(),
                        llama_token_to_piece(ctx_dft, i).c_str());
                return 1;
@@ -140,16 +140,18 @@ int main(int argc, char ** argv) {
    const int max_tokens_list_size = max_context_size - 4;

    if ((int) inp.size() > max_tokens_list_size) {
-        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
        return 1;
    }

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    for (auto id : inp) {
-        LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
+        fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
    }

+    fflush(stderr);
+
    const int n_input = inp.size();

    const auto t_enc_start = ggml_time_us();
@@ -178,16 +180,14 @@ int main(int argc, char ** argv) {
    bool has_eos = false;

    // target model sampling context (reuse the llama_context's sampling instance)
-    struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
-
-    struct llama_sampler * softmax = llama_sampler_init_softmax();
+    struct llama_sampling * smpl = llama_sampling_init(model_tgt, params.sparams);

    // draft sequence data
    std::vector<seq_draft> drafts(n_seq_dft);

    for (int s = 0; s < n_seq_dft; ++s) {
-        // allocate gpt_sampler for each draft sequence
-        drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
+        // allocate llama_sampling for each draft sequence
+        drafts[s].smpl = llama_sampling_init(model_dft, params.sparams);
    }

    llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -211,7 +211,7 @@ int main(int argc, char ** argv) {
            active_seqs.insert(s);
            const auto & tokens = drafts[s].tokens;

-            LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
+            LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
        }

        int i_dft  = 0;
@@ -229,13 +229,19 @@ int main(int argc, char ** argv) {
                bool accept = false;
                if (params.sparams.temp > 0) {
                    // stochastic verification
-                    gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

-                    auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
+                    llama_sampling_set_logits(smpl, llama_get_logits_ith(ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]));
+
+                    auto & dist_tgt = *llama_sampling_get_candidates(smpl);
+
+                    llama_sampling_grammar(smpl, &dist_tgt);
+                    llama_sampling_softmax(smpl, &dist_tgt);

                    float p_tgt = 0.0f;
                    float p_dft = 0.0f;

+                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());
+
                    while (active_seqs.size() > 0) {
                        // randomly select a sequence to verify from active sequences
                        std::uniform_int_distribution<unsigned int> u_int_dist(0, active_seqs.size() - 1);
@@ -253,13 +259,9 @@ int main(int argc, char ** argv) {
                            }
                            continue;
                        }
-
-                        LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+                        LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
                        float r = u_dist(rng);
-                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
-
-                        //GGML_ASSERT(dist_tgt.size <= dist_dft.size);
-
+                        llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
                        // acquire the token probabilities assigned by the draft and target models
                        for (size_t i = 0; i < dist_tgt.size; i++) {
                            if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
@@ -272,23 +274,24 @@ int main(int argc, char ** argv) {
                                break;
                            }
                        }
-                        LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+                        LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
                        if (r <= p_tgt / p_dft) {
                            s_keep = s;
                            accept = true;
                            token_id = drafts[s].tokens[i_dft];
                            token_str = llama_token_to_piece(ctx_tgt, token_id);
-                            gpt_sampler_accept(smpl, token_id, true);
+                            llama_sampling_accept(smpl, token_id, true);

-                            LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+                            LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
                            break;
                        } else {
-                            LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+                            LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
                            drafts[s].active = false;

                            // calculate residual probability
                            GGML_ASSERT(dist_tgt.sorted);
                            GGML_ASSERT(dist_dft.sorted);
+                            float sum_probs = 0.0f;

                            // sort dist by id
                            std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
@@ -298,18 +301,10 @@ int main(int argc, char ** argv) {
                                return a.id < b.id;
                            });

-                            float sum_probs = 0.0f;
-
                            for (size_t i = 0; i < dist_tgt.size; i++) {
-                                if (i < dist_dft.size) {
-                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
-                                } else {
-                                    dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
-                                }
-
+                                dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
                                sum_probs += dist_tgt.data[i].p;
                            }
-
                            for (size_t i = 0; i < dist_tgt.size; i++) {
                                dist_tgt.data[i].p /= sum_probs;
                            }
@@ -338,28 +333,22 @@ int main(int argc, char ** argv) {
                    if (!accept) {
                        // all drafted tokens were rejected
                        // sample from the target model
-                        LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
-                        std::vector<float> probs(dist_tgt.size);
-                        for (size_t i = 0; i < dist_tgt.size; ++i) {
-                            probs[i] = dist_tgt.data[i].p;
-                        }
-
-                        std::discrete_distribution<> dist(probs.begin(), probs.end());
-
-                        const int idx = dist(rng);
-
-                        token_id = dist_tgt.data[idx].id;
-                        gpt_sampler_accept(smpl, token_id, true);
+                        LOG("all drafted tokens were rejected, sampling from residual distribution\n");
+                        token_id = llama_sampling_sample_dist(smpl, &dist_tgt);
+                        llama_sampling_accept(smpl, token_id, true);
                        token_str = llama_token_to_piece(ctx_tgt, token_id);
                    }
+
                } else {
                    // greedy verification

                    // sample from the target model
-                    LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
-                    token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
+                    LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+                    token_id = llama_sampling_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

-                    gpt_sampler_accept(smpl, token_id, true);
+                    llama_sampling_accept(smpl, token_id, true);
+
+                    //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());

                    token_str = llama_token_to_piece(ctx_tgt, token_id);

@@ -369,7 +358,7 @@ int main(int argc, char ** argv) {
                        }

                        if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
-                            LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+                            LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());

                            s_keep = s;
                            accept = true;
@@ -391,24 +380,26 @@ int main(int argc, char ** argv) {
                    ++i_dft;
                    if (params.use_color) {
                        // Color token according to its origin sequence
-                        LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
                    } else {
-                        LOG("%s", token_str.c_str());
+                        printf("%s", token_str.c_str());
                    }
+                    fflush(stdout);
                    continue;
                } else {
-                    LOG("%s", token_str.c_str());
+                    printf("%s", token_str.c_str());
+                    fflush(stdout);
                    break;
                }
            }
        }

        {
-            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());

            // TODO: simplify
            {
-                LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+                LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

                llama_kv_cache_seq_keep(ctx_dft, s_keep);
                llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
@@ -435,7 +426,7 @@ int main(int argc, char ** argv) {
            llama_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);

            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+            // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
            llama_decode(ctx_dft, batch_dft);

            ++n_past_dft;
@@ -445,10 +436,7 @@ int main(int argc, char ** argv) {
            break;
        }

-        if (drafts[0].smpl) {
-            gpt_sampler_free(drafts[0].smpl);
-        }
-        drafts[0].smpl = gpt_sampler_clone(smpl);
+        llama_sampling_cp(smpl, drafts[0].smpl);

        int n_seq_cur  = 1;
        int n_past_cur = n_past_dft;
@@ -477,12 +465,12 @@ int main(int argc, char ** argv) {
                    continue;
                }

-                gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+                llama_sampling_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft);

-                const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
+                const auto * cur_p = llama_sampling_get_candidates(drafts[s].smpl);

                for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
-                    LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
                            k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
                }

@@ -491,7 +479,7 @@ int main(int argc, char ** argv) {
                // attempt to split the branch if the probability is high enough
                for (int f = 1; f < 8; ++f) {
                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
-                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
+                        LOG("splitting seq %3d into %3d\n", s, n_seq_cur);

                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
                        llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@@ -517,10 +505,7 @@ int main(int argc, char ** argv) {
                        drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
                        drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

-                        if (drafts[n_seq_cur].smpl) {
-                            gpt_sampler_free(drafts[n_seq_cur].smpl);
-                        }
-                        drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
+                        llama_sampling_cp(drafts[s].smpl, drafts[n_seq_cur].smpl);

                        sa.push_back(n_seq_cur);

@@ -536,7 +521,7 @@ int main(int argc, char ** argv) {

                    const int s = sa[is];

-                    gpt_sampler_accept(drafts[s].smpl, id, true);
+                    llama_sampling_accept(drafts[s].smpl, id, true);

                    drafts[s].tokens.push_back(id);
                    // save cur_p.data into drafts[s].dists
@@ -580,7 +565,7 @@ int main(int argc, char ** argv) {
                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
            }

-            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+            // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
            llama_decode(ctx_tgt, batch_tgt);
            ++n_past_tgt;
        }
@@ -598,33 +583,30 @@ int main(int argc, char ** argv) {

    auto t_dec_end = ggml_time_us();

-    LOG("\n\n");
+    LOG_TEE("\n\n");

-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
+    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

-    LOG_INF("\n");
-    LOG_INF("n_draft   = %d\n", n_draft);
-    LOG_INF("n_predict = %d\n", n_predict);
-    LOG_INF("n_drafted = %d\n", n_drafted);
-    LOG_INF("n_accept  = %d\n", n_accept);
-    LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("\n");
+    LOG_TEE("n_draft   = %d\n", n_draft);
+    LOG_TEE("n_predict = %d\n", n_predict);
+    LOG_TEE("n_drafted = %d\n", n_drafted);
+    LOG_TEE("n_accept  = %d\n", n_accept);
+    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);

-    LOG_INF("\n");
-    LOG_INF("draft:\n\n");
+    LOG_TEE("\ndraft:\n");
    // TODO: print sampling/grammar timings for all drafts
-    llama_perf_context_print(ctx_dft);
+    llama_print_timings(ctx_dft, nullptr);

-    LOG_INF("\n");
-    LOG_INF("target:\n\n");
-    gpt_perf_print(ctx_tgt, smpl);
+    LOG_TEE("\ntarget:\n");
+    llama_print_timings(ctx_tgt, smpl);

-    gpt_sampler_free(smpl);
+    llama_sampling_free(smpl);
    for (int s = 0; s < n_seq_dft; ++s) {
-        gpt_sampler_free(drafts[s].smpl);
+        llama_sampling_free(drafts[s].smpl);
    }

-    llama_sampler_free(softmax);
    llama_batch_free(batch_dft);

    llama_free(ctx_tgt);
@@ -635,7 +617,7 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    LOG("\n\n");
+    fprintf(stderr, "\n\n");

    return 0;
 }
--- a/Show More
+++ b/Show More